# Seed the random number generator from the `seed` entry of `params`.
# NOTE(review): `params` is not defined in the visible code - presumably the
# R Markdown parameter list; confirm. The seed is set again with
# set.seed(2010) before the class-balancing step further down.
set.seed(params$seed)

1 Install packages

1.1 Required libraries

library(funModeling) 
library(tidyverse) 
library(Hmisc)
library(biomaRt)
library(survminer)
library(glmnet)
library(glmSparseNet)
library(PRROC)
library(propagate)
library(lsa)
library(edgeR)
library(limma)
library(Glimma)
library(gplots)
library(DESeq2)
library(RColorBrewer)
library(GEOquery)
library(tibble)
library ( DESeq2 )
library(NMF)
library(ISLR)
library(tree)
library(readxl)
# library(ggbiplot)
library(caret)
library(rpart)
library(rpart.plot)
library(futile.logger)
library(ggpubr)
library(rstatix)
library(writexl)
library(pROC)

2 Data

2.1 RnaSeq

  • RNAseq data from CRC patients provided by Hospital de Santa Maria (Lisbon)
# Load the first RNA-seq table. Its first column (auto-named `...1`) holds the
# Ensembl gene IDs; the remaining columns are samples.
rnaseq1 <- as.data.frame(read_excel("data/rnaseq_LCosta.xlsx"))

# Query Ensembl BioMart for the HGNC symbol of every Ensembl gene ID.
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
hgnc_swissprot <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = rnaseq1$...1,
  mart = ensembl
)
hgnc_swissprot[1:3,1:2]
##   ensembl_gene_id hgnc_symbol
## 1 ENSG00000000003      TSPAN6
## 2 ENSG00000000005        TNMD
## 3 ENSG00000000419        DPM1
# Keep a single annotation row per Ensembl ID.
hgnc_swissprot <- hgnc_swissprot[!duplicated(hgnc_swissprot$ensembl_gene_id), ]

# Keep only the genes for which BioMart returned an annotation.
rnaseq1 <- rnaseq1[rnaseq1$...1 %in% hgnc_swissprot$ensembl_gene_id, ]

# Look each symbol up by Ensembl ID instead of assigning by position: getBM()
# does not return rows in the order of the queried IDs, so a positional
# assignment can attach symbols to the wrong genes.
rnaseq1$genes <- hgnc_swissprot$hgnc_symbol[
  match(rnaseq1$...1, hgnc_swissprot$ensembl_gene_id)
]
rnaseq1 <- rnaseq1[!duplicated(rnaseq1$genes), ]
which(is.na(rnaseq1$genes))
## integer(0)
rownames(rnaseq1) <- rnaseq1$genes
# Drop the ID and symbol columns by name rather than by hard-coded position.
rnaseq1 <- rnaseq1[, !(colnames(rnaseq1) %in% c("...1", "genes"))]
# Transpose to samples x genes.
rnaseq1 <- t(rnaseq1)
# Remove the gene whose HGNC symbol is empty. Its position is data-dependent,
# so it is selected by name; a logical mask is also safe when nothing matches.
which(colnames(rnaseq1) == "")
rnaseq1 <- rnaseq1[, colnames(rnaseq1) != "", drop = FALSE]
rnaseq1 <- as.data.frame(rnaseq1)
# Keep the sample ID as a column so duplicates can be detected after rbind().
rnaseq1$row <- rownames(rnaseq1)
dim(rnaseq1)
## [1]    99 39475
# Load the second RNA-seq table. Its first column (auto-named `...1`) holds the
# Ensembl gene IDs; the remaining columns are samples.
rnaseq2 <- as.data.frame(read_excel("data/rnaseq_illumina.xlsx"))

# Query BioMart for the HGNC symbols, reusing the `ensembl` mart object created
# earlier in this script for the first table (no second connection needed).
hgnc_swissprot <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = rnaseq2$...1,
  mart = ensembl
)
hgnc_swissprot[1:3,1:2]
##   ensembl_gene_id hgnc_symbol
## 1 ENSG00000000003      TSPAN6
## 2 ENSG00000000005        TNMD
## 3 ENSG00000000419        DPM1
# Keep a single annotation row per Ensembl ID.
hgnc_swissprot <- hgnc_swissprot[!duplicated(hgnc_swissprot$ensembl_gene_id), ]

# Keep only the genes for which BioMart returned an annotation.
rnaseq2 <- rnaseq2[rnaseq2$...1 %in% hgnc_swissprot$ensembl_gene_id, ]

# Look each symbol up by Ensembl ID instead of assigning by position: getBM()
# does not return rows in the order of the queried IDs, so a positional
# assignment can attach symbols to the wrong genes.
rnaseq2$genes <- hgnc_swissprot$hgnc_symbol[
  match(rnaseq2$...1, hgnc_swissprot$ensembl_gene_id)
]
rnaseq2 <- rnaseq2[!duplicated(rnaseq2$genes), ]
which(is.na(rnaseq2$genes))
## integer(0)
rownames(rnaseq2) <- rnaseq2$genes
# Drop the ID and symbol columns by name rather than by hard-coded position.
rnaseq2 <- rnaseq2[, !(colnames(rnaseq2) %in% c("...1", "genes"))]
# Transpose to samples x genes.
rnaseq2 <- t(rnaseq2)
# Remove the gene whose HGNC symbol is empty. Its position is data-dependent,
# so it is selected by name; a logical mask is also safe when nothing matches.
which(colnames(rnaseq2) == "")
rnaseq2 <- rnaseq2[, colnames(rnaseq2) != "", drop = FALSE]
rnaseq2 <- as.data.frame(rnaseq2)
# Keep the sample ID as a column so duplicates can be detected after rbind().
rnaseq2$row <- rownames(rnaseq2)
dim(rnaseq2)
## [1]    86 39475
# Stack both tables and keep the first occurrence of every sample ID.
rnaseq <- rbind(rnaseq1, rnaseq2)
rnaseq1 <- rnaseq[!duplicated(rnaseq$row), ]
# Drop the helper ID column by name instead of by its hard-coded position.
rnaseq1$row <- NULL
# Sort samples by row name so later tables can be ordered the same way.
rnaseq1 <- rnaseq1[order(row.names(rnaseq1)), ]

2.2 Clinic

  • clinical data from CRC patients provided by Hospital de Santa Maria (Lisbon)
# Read the clinical table as a plain data.frame: read_excel() returns a tibble,
# and setting row names on a tibble is deprecated (they are also dropped as
# soon as the tibble is subset).
DATASET1 <- as.data.frame(read_excel("data/DATASET1NEW_illumina.xlsx"))
DATASET1 <- DATASET1[order(DATASET1$ID), ]
rownames(DATASET1) <- DATASET1$ID

# Keep only the samples present in both the expression and clinical tables.
rnaseq1 <- rnaseq1[rownames(rnaseq1) %in% rownames(DATASET1), ]
DATASET1 <- DATASET1[rownames(DATASET1) %in% rownames(rnaseq1), ]

# Clinical covariates: everything except columns 1, 8 and 9.
# NOTE(review): column 1 is presumably ID and columns 8-9 the survival
# variables extracted later as df[, 8:9] - confirm against the spreadsheet.
clinic1 <- DATASET1[, -c(1, 8, 9)]
rownames(clinic1) <- DATASET1$ID
  • Divide data into 3 smaller datasets
set.seed(2010)
# Class proportions before balancing (the data are imbalanced)
prop.table(table(DATASET1$class))
## 
##         P        Pm 
## 0.7272727 0.2727273
# Separate the two classes.
df_p <- DATASET1[DATASET1$class %in% "P", ]
df_pm <- DATASET1[DATASET1$class %in% "Pm", ]

# First draw: `params$nsample` P patients, so that the P group is of a size
# comparable to the Pm group. The P patients not drawn go to `df_p2`.
nsample <- params$nsample
pick_negative <- sample(df_p$ID, nsample)
in_draw <- df_p$ID %in% pick_negative
df_p1f <- df_p[in_draw, ]
df_p2 <- df_p[!in_draw, ]

# Second draw: 25 of the remaining P patients; the rest form the third group.
nsample <- 25
pick_negative <- sample(df_p2$ID, nsample)
in_draw <- df_p2$ID %in% pick_negative
df_p2f <- df_p2[in_draw, ]
df_p3f <- df_p2[!in_draw, ]

# Each dataset pairs one P partition with all Pm patients.
df1 <- rbind(df_p1f, df_pm)
df2 <- rbind(df_p2f, df_pm)
df3 <- rbind(df_p3f, df_pm)

# Sizes and class counts of the three datasets
dim(df1)
## [1] 60  9
dim(df2)
## [1] 55  9
dim(df3)
## [1] 55  9
table(df1$class)
## 
##  P Pm 
## 30 30
table(df2$class)
## 
##  P Pm 
## 25 30
table(df3$class)
## 
##  P Pm 
## 25 30
# For each dataset, split the columns into a clinical table (all columns except
# 1, 8 and 9) and a two-column table (columns 8:9), both indexed by patient ID.
surv_cols <- 8:9
non_clinical_cols <- c(1, surv_cols)

# Dataset 1
clinic1_bal <- df1[, -non_clinical_cols]
rownames(clinic1_bal) <- df1$ID
datasurv1 <- as.data.frame(df1[, surv_cols])
rownames(datasurv1) <- df1$ID

# Dataset 2
clinic2_bal <- df2[, -non_clinical_cols]
rownames(clinic2_bal) <- df2$ID
datasurv2 <- as.data.frame(df2[, surv_cols])
rownames(datasurv2) <- df2$ID

# Dataset 3
clinic3_bal <- df3[, -non_clinical_cols]
rownames(clinic3_bal) <- df3$ID
datasurv3 <- as.data.frame(df3[, surv_cols])
rownames(datasurv3) <- df3$ID

3 Analysis

3.1 DATASET 1

3.1.1 DATA

  • Here we give the example for dataset 1. To analyse the other datasets, replace x with the dataset number in dfx, DATASETx_bal, and clinicx_bal. Note: remember to change save.image(“~resultsx_2010.RData”) accordingly, so that the results of each dataset are saved separately.
# Restore a previously saved workspace.
# NOTE(review): load() restores every object stored in the file into the
# current environment, replacing any same-named objects built above (possibly
# including df1 and clinic1_bal) - verify what this .RData file contains.
load("~/CRC_LCosta/results/results1_2010_final.RData")
# Work on dataset 1: alias its full table and its clinical table under the
# generic names used by the analysis below.
DATASET1_bal <- df1
DATASET1 <- DATASET1_bal
clinic1 <- clinic1_bal
  • Split dataset in two groups for classification: – P -> non-metastatic (CLASS = 1) – PM -> metastatic (CLASS = 0)
# primM <- DATASET1 %>%
#   filter(str_detect(class, "m"))
# rownames(primM) <- primM$ID
# primN <- DATASET1 %>%
#   filter(!str_detect(class, "Pm"))
# rownames(primN) <- primN$ID
# 
# clinic_prim <- DATASET1
# 
# rnaprimM <- rnaseq1[rownames(rnaseq1) %in% 
#                          rownames(primM),]
# 
# rnaprimN <- rnaseq1[rownames(rnaseq1) %in% 
#                          rownames(primN),]
# 
# rnaprim <- rbind(rnaprimM,rnaprimN)
  • removing variables with sd=0
# xmet <- rnaprimM [,sapply(seq(ncol(rnaprimM)), function(ix) {sd(rnaprimM[,ix])}) != 0] 
# xnon <- rnaprimN[,sapply(seq(ncol(rnaprimN)), function(ix) {sd(rnaprimN[,ix])}) != 0] 
# 
# xmet_less <- xmet[,which(colnames(xmet) %in% colnames(xnon))]
# xnon_less <- xnon[,which(colnames(xnon) %in% colnames(xmet))]
# 
# # normalizing data
# xmet_norm <- scale(log2(xmet_less+1)) 
# xnon_norm <- scale(log2(xnon_less+1)) 
# 
# xdataT <- rbind(xmet_less,xnon_less)
# xdataT <- xdataT[, !sapply(xdataT, function(x) { sd(x) == 0} )]
# xdataT <- xdataT[ order(row.names(xdataT)), ]
# 
# 
# 
# rm(xmet,xmet_less,xnon,xnon_less,rnaprimM,rnaprimN)
  • weight vector that penalizes genes with greater distances between Pm and P correlation matrices - TWINER
# #xmet_cor <- Matrix(cor(xmet_norm), sparse = TRUE)
# xmet_cor <- cor(xmet_norm)
# #xmet_cor <- as.data.frame(xmet_cor)
# xnon_cor <- cor(xnon_norm)
# #xnon_cor <- as.data.frame(xnon_cor)
# 
# # angular distance
# ang_weight <- vector()
# for (i in 1:dim(xmet_cor)[2]){ 
# ang_weight[i] <- acos(cosine(xmet_cor[,i],xnon_cor[,i]))/pi
# }
# 
# ## normalized weights
# 
# weights <- ang_weight / max(ang_weight)
# hist(weights,main="w")
# 
# 
# pen_weight1 <- 1 / weights
# hist(pen_weight1, main="1 / w")
# 
# rm(xmet_cor,xnon_cor)

3.1.2 Exploratory analysis

  • All dataset x
clinical <- as.data.frame(clinic1_bal)


#' Run a basic exploratory analysis on a clinical table.
#'
#' Calls, in order, glimpse(), freq(), profiling_num(), plot_num() and
#' describe() on the table it is given.
#'
#' @param clinical A data frame of clinical variables.
#' @return The result of describe(clinical), i.e. the value of the last call.
basic_eda <- function(clinical)
{
  glimpse(clinical)
  #df_Status(clinical)
  freq(clinical)
  # NOTE(review): the values returned by profiling_num() and plot_num() are
  # not printed from inside a function; wrap them in print() if they should
  # appear in the report.
  profiling_num(clinical)
  plot_num(clinical)
  # Describe the table passed in, not the global `clinic1`.
  describe(clinical)
}
basic_eda(clinical)
## Rows: 60
## Columns: 6
## $ class     <chr> "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", …
## $ organ     <chr> "rectum", "colon", "rectum", "colon", "colon", "colon", "col…
## $ Sex       <chr> "m", "f", "m", "f", "f", "m", "f", "m", "m", "f", "f", "f", …
## $ Age       <dbl> 78, 74, 65, 79, 74, 52, 57, 63, 86, 69, 40, 65, 60, 65, 84, …
## $ Stage     <chr> "III", "II", "II", "III", "II", "III", "III", "II", "II", "I…
## $ sidedness <chr> "rectum", "right", "rectum", "left", "left", "left", "right"…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1     P        30         50              50
## 2    Pm        30         50             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        53      88.33           88.33
## 2 rectum         7      11.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        37      61.67           61.67
## 2   m        23      38.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1    II        32      53.33           53.33
## 2   III        28      46.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1      left        26      43.33           43.33
## 2     right        23      38.33           81.66
## 3    rectum         7      11.67           93.33
## 4      <NA>         4       6.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinic1 
## 
##  6  Variables      60  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct 
##       60        0        2 
##                   
## Value        P  Pm
## Frequency   30  30
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       60        0        2 
##                         
## Value       colon rectum
## Frequency      53      7
## Proportion  0.883  0.117
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       60        0        2 
##                       
## Value          f     m
## Frequency     37    23
## Proportion 0.617 0.383
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       47       13       27    0.998    68.55    14.37     48.5     54.8 
##      .25      .50      .75      .90      .95 
##     60.0     69.0     78.5     85.0     86.0 
## 
## lowest : 37 40 47 52 53, highest: 84 85 86 87 91
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       60        0        2 
##                       
## Value         II   III
## Frequency     32    28
## Proportion 0.533 0.467
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       56        4        3 
##                                
## Value        left rectum  right
## Frequency      26      7     23
## Proportion  0.464  0.125  0.411
## --------------------------------------------------------------------------------
# Mean age over the patients with a recorded age
a <- na.omit(clinical[["Age"]])
mean(a)
## [1] 68.55319
  • Only patients that do not metastasize
# Patients whose class does not contain "Pm"
clinical_p <- filter(clinic1_bal, !str_detect(class, "Pm"))

# Exploratory summary of the table: structure, frequency tables, numeric
# profiling, numeric plots and a describe() report (the returned value).
basic_eda <- function(clinical_p) {
  glimpse(clinical_p)
  # df_Status(clinical_p)
  freq(clinical_p)
  profiling_num(clinical_p)
  plot_num(clinical_p)
  describe(clinical_p)
}

basic_eda(clinical_p)
## Rows: 30
## Columns: 6
## $ class     <chr> "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", …
## $ organ     <chr> "rectum", "colon", "rectum", "colon", "colon", "colon", "col…
## $ Sex       <chr> "m", "f", "m", "f", "f", "m", "f", "m", "m", "f", "f", "f", …
## $ Age       <dbl> 78, 74, 65, 79, 74, 52, 57, 63, 86, 69, 40, 65, 60, 65, 84, …
## $ Stage     <chr> "III", "II", "II", "III", "II", "III", "III", "II", "II", "I…
## $ sidedness <chr> "rectum", "right", "rectum", "left", "left", "left", "right"…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1     P        30        100             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        28      93.33           93.33
## 2 rectum         2       6.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        20      66.67           66.67
## 2   m        10      33.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1    II        22      73.33           73.33
## 2   III         8      26.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1     right        14      46.67           46.67
## 2      left        13      43.33           90.00
## 3    rectum         2       6.67           96.67
## 4      <NA>         1       3.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinical_p 
## 
##  6  Variables      30  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct    value 
##       30        0        1        P 
##              
## Value       P
## Frequency  30
## Proportion  1
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       30        0        2 
##                         
## Value       colon rectum
## Frequency      28      2
## Proportion  0.933  0.067
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       30        0        2 
##                       
## Value          f     m
## Frequency     20    10
## Proportion 0.667 0.333
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       21        9       15    0.995    68.29    15.46       47       52 
##      .25      .50      .75      .90      .95 
##       60       65       79       84       86 
## 
## lowest : 40 47 52 57 60, highest: 79 83 84 86 91
##                                                                             
## Value         40    47    52    57    60    63    65    69    74    78    79
## Frequency      1     1     1     1     2     2     3     1     2     1     2
## Proportion 0.048 0.048 0.048 0.048 0.095 0.095 0.143 0.048 0.095 0.048 0.095
##                                   
## Value         83    84    86    91
## Frequency      1     1     1     1
## Proportion 0.048 0.048 0.048 0.048
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       30        0        2 
##                       
## Value         II   III
## Frequency     22     8
## Proportion 0.733 0.267
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       29        1        3 
##                                
## Value        left rectum  right
## Frequency      13      2     14
## Proportion  0.448  0.069  0.483
## --------------------------------------------------------------------------------
# Mean age over the P patients with a recorded age
a <- na.omit(clinical_p[["Age"]])
mean(a)
## [1] 68.28571
  • Only patients that do metastasize
# Patients whose class contains "Pm"
clinical_Pm <- filter(clinic1_bal, str_detect(class, "Pm"))

# Exploratory summary of the table: structure, frequency tables, numeric
# profiling, numeric plots and a describe() report (the returned value).
basic_eda <- function(clinical_Pm) {
  glimpse(clinical_Pm)
  # df_Status(clinical_Pm)
  freq(clinical_Pm)
  profiling_num(clinical_Pm)
  plot_num(clinical_Pm)
  describe(clinical_Pm)
}

basic_eda(clinical_Pm)
## Rows: 30
## Columns: 6
## $ class     <chr> "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", …
## $ organ     <chr> "rectum", "colon", "rectum", "colon", "colon", "colon", "rec…
## $ Sex       <chr> "f", "f", "f", "m", "f", "f", "m", "f", "f", "m", "m", "f", …
## $ Age       <dbl> 58, 85, 62, 67, 75, 69, 58, 57, 72, 65, 61, 78, 81, 57, 74, …
## $ Stage     <chr> "II", "II", "III", "III", "III", "II", "II", "III", "II", "I…
## $ sidedness <chr> "left", "right", "rectum", "rectum", "left", "left", "rectum…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1    Pm        30        100             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        25      83.33           83.33
## 2 rectum         5      16.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        17      56.67           56.67
## 2   m        13      43.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1   III        20      66.67           66.67
## 2    II        10      33.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1      left        13      43.33           43.33
## 2     right         9      30.00           73.33
## 3    rectum         5      16.67           90.00
## 4      <NA>         3      10.00          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinical_Pm 
## 
##  6  Variables      30  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct    value 
##       30        0        1       Pm 
##              
## Value      Pm
## Frequency  30
## Proportion  1
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       30        0        2 
##                         
## Value       colon rectum
## Frequency      25      5
## Proportion  0.833  0.167
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       30        0        2 
##                       
## Value          f     m
## Frequency     17    13
## Proportion 0.567 0.433
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        4       18    0.997    68.77    13.91    53.75    56.50 
##      .25      .50      .75      .90      .95 
##    58.75    69.00    77.25    85.00    85.75 
## 
## lowest : 37 53 56 57 58, highest: 78 81 85 86 87
##                                                                             
## Value         37    53    56    57    58    61    62    65    67    69    72
## Frequency      1     1     1     2     2     1     1     1     1     3     2
## Proportion 0.038 0.038 0.038 0.077 0.077 0.038 0.038 0.038 0.038 0.115 0.077
##                                                     
## Value         74    75    78    81    85    86    87
## Frequency      2     1     1     2     2     1     1
## Proportion 0.077 0.038 0.038 0.077 0.077 0.038 0.038
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       30        0        2 
##                       
## Value         II   III
## Frequency     10    20
## Proportion 0.333 0.667
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       27        3        3 
##                                
## Value        left rectum  right
## Frequency      13      5      9
## Proportion  0.481  0.185  0.333
## --------------------------------------------------------------------------------
# Mean age over the Pm patients with a recorded age
a <- na.omit(clinical_Pm[["Age"]])
mean(a)
## [1] 68.76923
  • Statistical differences between patient groups with respect to several variables
# Turn every character column of the clinical table into a factor.
clinical_factor <- clinic1_bal %>%
  mutate_if(is.character, as.factor)
#clinical_factor <- na.omit(clinical_factor)

# Proportion of each organ within each class
ggplot(clinical_factor, aes(x = class, fill = organ)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Proportion of each sex within each class
ggplot(clinical_factor, aes(x = class, fill = Sex)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Sidedness is compared without the "rectum" rows.
clinical_factor1 <- filter(clinical_factor, !str_detect(sidedness, "rectum"))

ggplot(clinical_factor1, aes(x = class, fill = sidedness)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Proportion of each stage within each class
ggplot(clinical_factor, aes(x = class, fill = Stage)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4", "#000080"))

# Overlaid age histograms of the two classes. labs() is now chained onto the
# plot with `+`; previously it was a separate statement, so the empty legend
# title was never applied and the labels object was printed on its own.
clinical_factor %>%
  ggplot(aes(x = Age, fill = class)) +
  geom_histogram(color = "#e9ecef", alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("#69b3a2", "#404080")) +
  labs(fill = "")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite values (stat_bin).
# Contingency table of class by organ, tested with Fisher's exact test.
stat_data_organ <- table(clinical_factor$class,clinical_factor$organ)
# TODO: make plot(...)
fisher.test(stat_data_organ)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_organ
## p-value = 0.4238
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.4061343 31.3554016
## sample estimates:
## odds ratio 
##   2.754099
# Contingency table of class by sex, tested with Fisher's exact test.
stat_data_sex <- table(clinical_factor$class,clinical_factor$Sex)
fisher.test(stat_data_sex)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_sex
## p-value = 0.5959
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4748001 4.9821718
## sample estimates:
## odds ratio 
##   1.518518
# Contingency table of class by stage, tested with Fisher's exact test.
stat_data_stage <- table(clinical_factor$class,clinical_factor$Stage)
fisher.test(stat_data_stage)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_stage
## p-value = 0.004025
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   1.603346 19.445739
## sample estimates:
## odds ratio 
##   5.327244
# Contingency table of class by sidedness, tested with Fisher's exact test.
# Unlike the sidedness bar plot, the "rectum" rows are not filtered out here.
stat_data_side <- table(clinical_factor$class,clinical_factor$sidedness)
fisher.test(stat_data_side)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_side
## p-value = 0.3185
## alternative hypothesis: two.sided
# Age distribution of the P patients
hist(clinical_factor$Age[clinical_factor$class=="P"])

# Age distribution of the Pm patients
hist(clinical_factor$Age[clinical_factor$class=="Pm"])

# Five-number summary, mean and NA count of Age per class
tapply(clinical_factor$Age,clinical_factor$class, summary)
## $P
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   40.00   60.00   65.00   68.29   79.00   91.00       9 
## 
## $Pm
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   37.00   58.75   69.00   68.77   77.25   87.00       4
# Two-sample t-test of Age between the two classes (reported as a Welch test
# in the output below).
t.test(Age ~ class, clinical_factor)
## 
##  Welch Two Sample t-test
## 
## data:  Age by class
## t = -0.12846, df = 41.11, p-value = 0.8984
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.084572  7.117539
## sample estimates:
##  mean in group P mean in group Pm 
##         68.28571         68.76923

3.1.3 Survival analysis

3.1.3.1 Stage

# Join the survival columns with the clinical covariates on the row names
# (merge() adds the key as a `Row.names` column).
data <- merge(datasurv1, clinical, by="row.names")

# Survival curves stratified by Stage
fit <- survfit(Surv(time, Status) ~ Stage, data = data)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ Stage, data = data)
## 
##            n events median 0.95LCL 0.95UCL
## Stage=II  32      7     NA    1635      NA
## Stage=III 28     17   1157     797      NA
# Summary of survival curves
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ Stage, data = data)
## 
##                 Stage=II 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     32       1    0.969  0.0308        0.910        1.000
##   400     30       1    0.936  0.0435        0.855        1.000
##  1357     16       1    0.878  0.0698        0.751        1.000
##  1428     14       1    0.815  0.0886        0.659        1.000
##  1461     13       1    0.753  0.1016        0.578        0.980
##  1635     11       1    0.684  0.1131        0.495        0.946
##  2011     10       1    0.616  0.1207        0.419        0.904
## 
##                 Stage=III 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   216     28       1    0.964  0.0351        0.898        1.000
##   386     27       1    0.929  0.0487        0.838        1.000
##   420     26       1    0.893  0.0585        0.785        1.000
##   500     25       1    0.857  0.0661        0.737        0.997
##   518     24       1    0.821  0.0724        0.691        0.976
##   520     23       1    0.786  0.0775        0.648        0.953
##   573     22       1    0.750  0.0818        0.606        0.929
##   632     21       1    0.714  0.0854        0.565        0.903
##   682     20       1    0.679  0.0883        0.526        0.876
##   797     18       1    0.641  0.0911        0.485        0.847
##   807     17       1    0.603  0.0932        0.446        0.816
##   857     16       1    0.565  0.0947        0.407        0.785
##  1107     14       1    0.525  0.0961        0.367        0.752
##  1157     13       1    0.485  0.0969        0.328        0.717
##  1682     10       1    0.436  0.0986        0.280        0.679
##  1835      9       1    0.388  0.0988        0.235        0.639
##  3667      4       1    0.291  0.1120        0.137        0.619
# Access the short summary table (one row per stratum)
summary(fit)$table
##           records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## Stage=II       32    32      32      7 3111.160   332.7298     NA    1635
## Stage=III      28    28      28     17 2088.959   321.2685   1157     797
##           0.95UCL
## Stage=II       NA
## Stage=III      NA
# Plot the survival curves by Stage, with the p-value shown on the plot
ggsurvplot(fit, data = data, pval = TRUE)

# The log-rank test comparing the Stage groups
surv.stage <- survdiff(Surv(time,Status) ~ Stage, data = data)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ Stage, data = data)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## Stage=II  32        7     13.3      3.01       6.8
## Stage=III 28       17     10.7      3.76       6.8
## 
##  Chisq= 6.8  on 1 degrees of freedom, p= 0.009

3.1.3.2 Class - PM vs. P

# Survival curves stratified by class (P vs. Pm); `fit` is overwritten
fit <- survfit(Surv(time, Status) ~ class, data = data)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ class, data = data)
## 
##           n events median 0.95LCL 0.95UCL
## class=P  30      2     NA    3667      NA
## class=Pm 30     22   1107     682    2011
# Summary of survival curves
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ class, data = data)
## 
##                 class=P 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##  1461     13       1    0.923  0.0739        0.789            1
##  3667      5       1    0.738  0.1754        0.464            1
## 
##                 class=Pm 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     30       1    0.967  0.0328        0.905        1.000
##   216     29       1    0.933  0.0455        0.848        1.000
##   386     28       1    0.900  0.0548        0.799        1.000
##   400     27       1    0.867  0.0621        0.753        0.997
##   420     26       1    0.833  0.0680        0.710        0.978
##   500     25       1    0.800  0.0730        0.669        0.957
##   518     24       1    0.767  0.0772        0.629        0.934
##   520     23       1    0.733  0.0807        0.591        0.910
##   573     22       1    0.700  0.0837        0.554        0.885
##   632     21       1    0.667  0.0861        0.518        0.859
##   682     20       1    0.633  0.0880        0.482        0.832
##   797     19       1    0.600  0.0894        0.448        0.804
##   807     18       1    0.567  0.0905        0.414        0.775
##   857     17       1    0.533  0.0911        0.382        0.745
##  1107     15       1    0.498  0.0917        0.347        0.714
##  1157     14       1    0.462  0.0918        0.313        0.682
##  1357     13       1    0.427  0.0913        0.280        0.649
##  1428     12       1    0.391  0.0904        0.249        0.615
##  1635     11       1    0.356  0.0889        0.218        0.580
##  1682     10       1    0.320  0.0868        0.188        0.545
##  1835      9       1    0.284  0.0841        0.159        0.508
##  2011      7       1    0.244  0.0813        0.127        0.469
# Access the short summary table (one row per stratum)
summary(fit)$table
##          records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## class=P       30    30      30      2 3887.215   212.9214     NA    3667
## class=Pm      30    30      30     22 1716.567   276.5550   1107     682
##          0.95UCL
## class=P       NA
## class=Pm    2011
# Plot the survival curves by class, with the p-value shown on the plot
ggsurvplot(fit, data = data, pval = TRUE)

# The log-rank test comparing the classes.
# Note: the result is stored in `surv.stage`, the same variable used for the
# Stage comparison, which is therefore overwritten here.
surv.stage <- survdiff(Surv(time,Status) ~ class, data = data)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ class, data = data)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## class=P  30        2       13      9.32      20.5
## class=Pm 30       22       11     11.05      20.5
## 
##  Chisq= 20.5  on 1 degrees of freedom, p= 6e-06

3.1.3.3 Sidedness

# Exclude the "rectum" rows and keep left/right plus rows with missing
# sidedness (the model functions drop those themselves). A logical mask is
# used instead of data[-which(...), ], which would drop EVERY row if no row
# had sidedness == "rectum".
is_rectum <- !is.na(data$sidedness) & data$sidedness == "rectum"
newdata <- data[!is_rectum, ]
# Survival curves stratified by sidedness
fit <- survfit(Surv(time, Status) ~ sidedness, data = newdata)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
##    4 observations deleted due to missingness 
##                  n events median 0.95LCL 0.95UCL
## sidedness=left  26      8     NA    1835      NA
## sidedness=right 23      9   1635    1461      NA
# Summary of the survival curves by sidedness (one table per stratum)
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
## 4 observations deleted due to missingness 
##                 sidedness=left 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   400     25       1    0.960  0.0392        0.886        1.000
##   520     24       1    0.920  0.0543        0.820        1.000
##   632     23       1    0.880  0.0650        0.761        1.000
##   682     22       1    0.840  0.0733        0.708        0.997
##   807     20       1    0.798  0.0808        0.654        0.973
##  1107     15       1    0.745  0.0913        0.586        0.947
##  1428     13       1    0.688  0.1006        0.516        0.916
##  1835     10       1    0.619  0.1116        0.434        0.881
## 
##                 sidedness=right 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     23       1    0.957  0.0425        0.877        1.000
##   420     22       1    0.913  0.0588        0.805        1.000
##   500     21       1    0.870  0.0702        0.742        1.000
##   797     16       1    0.815  0.0843        0.666        0.998
##   857     15       1    0.761  0.0946        0.596        0.971
##  1157     12       1    0.697  0.1058        0.518        0.939
##  1461      7       1    0.598  0.1294        0.391        0.914
##  1635      6       1    0.498  0.1411        0.286        0.868
##  2011      5       1    0.399  0.1438        0.197        0.808
# Access the short summary table (one row per stratum)
summary(fit)$table
##                 records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## sidedness=left       26    26      26      8 2985.803   345.9387     NA    1835
## sidedness=right      23    23      23      9 2391.543   414.8524   1635    1461
##                 0.95UCL
## sidedness=left       NA
## sidedness=right      NA
# Plot the survival curves by sidedness, with the p-value shown on the plot
ggsurvplot(fit, data = newdata, pval = TRUE)

# The log-rank test comparing left- and right-sided tumours.
# Note: the result is stored in `surv.stage`, overwriting the previous test.
surv.stage <- survdiff(Surv(time,Status) ~ sidedness, data = newdata)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
## n=49, 4 observations deleted due to missingness.
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## sidedness=left  26        8      9.7     0.299     0.702
## sidedness=right 23        9      7.3     0.398     0.702
## 
##  Chisq= 0.7  on 1 degrees of freedom, p= 0.4

3.1.4 DEGs

# Differential expression between the classes in `ydata$class` with edgeR:
# negative-binomial GLM followed by a likelihood-ratio test on the group
# coefficient.

# Sort the expression table and the clinical table by sample ID
xdata <- rnaseq1[ order(row.names(rnaseq1)), ]
rownames(DATASET1_bal) <- DATASET1_bal$ID
ydata <- as.data.frame(DATASET1_bal[ order(row.names(DATASET1_bal)), ])
# Re-apply the IDs as row names in case the subsetting above reset them
rownames(ydata) <- ydata$ID

# Keep only the samples present in both tables
xdata <- xdata[rownames(xdata) %in% rownames(ydata), ]
ydata <- as.data.frame(ydata[rownames(ydata) %in% rownames(xdata), ])

# keep features with standard deviation > 0
# (seq_len() is safe for zero columns; vapply() guarantees a numeric vector)
sd_by_feature <- vapply(
  seq_len(ncol(xdata)),
  function(ix) sd(xdata[, ix]),
  numeric(1)
)
xdata <- xdata[, sd_by_feature != 0]
# edgeR expects genes in rows and samples in columns
xdata <- t(xdata)

# The group labels below are paired with the count columns purely by position,
# so fail loudly if the two tables are not in the same sample order.
stopifnot(identical(colnames(xdata), rownames(ydata)))

group <- as.factor(ydata$class)
# NOTE(review): this variable is named like base::class(); kept unchanged
# because later chunks may read it.
class <- as.data.frame(ydata$class)
edgeR.DGElist <- DGEList(counts=xdata, group = group)

# remove genes that do not have one count per million in at least 5 samples
keep <- rowSums(cpm(edgeR.DGElist) >= 1) >= 5
edgeR.DGElist <- edgeR.DGElist[keep ,]

# specify the design setup (intercept + group effect)
design <- model.matrix(~group)

# estimate the dispersion for all read counts across all samples
edgeR.DGElist <- estimateDisp(edgeR.DGElist, design)

# fit the negative binomial model
edger_fit <- glmFit(edgeR.DGElist, design )

# perform the testing for every gene using the neg. binomial model;
# the tested coefficient is the last design column (glmLRT's default),
# spelled out here so the comparison is explicit
edger_lrt <- glmLRT(edger_fit, coef = ncol(design))
summary(decideTests(edger_lrt))
##        groupPm
## Down      7944
## NotSig   10916
## Up        1589
# Extract the results table from edger_lrt for all genes (n = Inf), sorted by
# p-value, with Benjamini-Hochberg adjusted p-values
DGE.results_edgeR <- topTags(edger_lrt, n = Inf , sort.by = "PValue" , adjust.method = "BH" )

topTags(DGE.results_edgeR) # table with the top 10 DEGs
## Coefficient:  groupPm 
##               logFC     logCPM       LR       PValue          FDR
## LRP4      -2.569312  4.9195654 38.64129 5.093015e-10 1.041471e-05
## AXIN2     -1.932022  6.2429524 35.94787 2.026676e-09 1.847888e-05
## MIR3197    4.046824 -2.1741728 35.38116 2.710970e-09 1.847888e-05
## TFF2       3.209515  2.0356193 34.71053 3.825576e-09 1.955730e-05
## RNU7-77P   3.584652 -2.1579934 33.58504 6.821530e-09 2.339502e-05
## GBP4      -2.447218  4.2258158 33.23732 8.157008e-09 2.339502e-05
## RNU6-83P   3.279494 -0.6616794 33.10829 8.716611e-09 2.339502e-05
## IDO1      -3.193236  2.8868167 33.01341 9.152535e-09 2.339502e-05
## RNU6-769P  3.525962 -1.7899718 31.68738 1.810956e-08 3.760618e-05
## LGR6      -1.911118  2.0345325 31.65751 1.839023e-08 3.760618e-05
# Keep the genes that are significant at FDR < 0.05
genes_deg <- DGE.results_edgeR$table
genes_deg <- genes_deg[which(genes_deg$FDR < 0.05),]
dim(genes_deg) # genes found to be differentially expressed
## [1] 9533    5
# Store the gene names in a regular column before exporting
# (presumably so they are kept in the spreadsheet)
genes_deg$row <- row.names(genes_deg)
write_xlsx(genes_deg,"genes_deg_d1.xlsx")

# Top DEGs in table order (the table was sorted by p-value by topTags above).
# head() returns at most 100 names, so no NA entries are created when fewer
# than 100 genes pass the FDR filter.
top100_deg <- head(rownames(genes_deg), 100)
# up-regulated genes: positive logFC for the tested group coefficient
high <- genes_deg[genes_deg$logFC > 0,]
dim(high)
## [1] 1589    6
top100_high <- head(rownames(high), 100)

# down-regulated genes: negative logFC for the tested group coefficient
low <- genes_deg[genes_deg$logFC < 0,]
dim(low)
## [1] 7944    6
top100_low <- head(rownames(low), 100)

3.1.5 Classification

  • In this work we tested three types of classification: 1) classification without regularization based on DEGs; 2) classification with regularization (EN and iTwiner); 3) classification without regularization based on the genes selected by regularized logistic regression.
# Build the predictor table (xdata) and the numeric 0/1 outcome table (ydata)
# used by the classifiers, restricted to the samples present in both and
# sorted by sample ID.
xdata.raw <- xdataT

# keep features with standard deviation > 0
# (seq_len() is safe for zero columns; vapply() guarantees a numeric vector)
sd_by_feature <- vapply(
  seq_len(ncol(xdata.raw)),
  function(ix) sd(xdata.raw[, ix]),
  numeric(1)
)
xdata <- xdata.raw[, sd_by_feature != 0]

# Outcome table: class label plus sample ID.
# NOTE(review): clinic1$class and DATASET1$ID are combined by position, which
# assumes both objects list the samples in the same order — confirm.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

# ydata.raw$class <- c(rep(0,28),rep(1,34))

# Recode the class labels: "P" -> 0, "Pm" -> 1
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# Keep only the samples present in both tables
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ])

# Sort both tables by sample ID so rows correspond one-to-one
xdata <- xdata[ order(row.names(xdata)), ]
ydata.raw <- ydata.raw[ order(row.names(ydata.raw)), ]

# Predictors and labels are matched by position downstream: fail loudly if the
# two tables do not list exactly the same samples in the same order.
stopifnot(identical(rownames(xdata), rownames(ydata.raw)))

ydata <- as.data.frame(ydata.raw[,1:2])
# Convert the recoded labels to numeric
ydata$`clinic1$class` <- as.numeric(ydata$`clinic1$class`)

3.1.5.1 1) Classification without regularization based on DEGs

Five classifiers were used: Decision trees, linear and radial support vector machines, logistic regression and random forest

  • Let's pick the 50 differentially expressed genes with the lowest p-values found above
# Predictors for the DEG-based classifiers: the first 50 genes of top100_deg.
# ydata was filtered and sorted by sample ID earlier, whereas xdataT is still
# in its original row order, so take the rows of xdataT in ydata's order.
# Otherwise the positional label assignment below could attach labels to the
# wrong samples.
top50_deg <- head(top100_deg, 50)
sample_idx <- match(rownames(ydata), rownames(xdataT))
stopifnot(!anyNA(sample_idx))
xdata <- xdataT[sample_idx, top50_deg]

# Keep the original gene names ("nomesgenes" = gene names), then switch to
# generic Var1..VarK column names
nomesgenes <- colnames(xdata)
colnames(xdata) <- paste0("Var", seq_len(ncol(xdata)))
colnames(ydata) <- c("class","row")
# Class label as a factor, matched to the rows of xdata by position
xdata$type <- as.factor(ydata$class)

# Disabled filter: keep only genes that are zero in at most 60% of the samples
#xdata <- xdata[colMeans(xdata == 0) <= 0.6]
  • Test the different classifiers 100 times to obtain median values for the measures of model performance such as accuracy, misclassification, sensitivity and specificity, among others
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.1.5.1.1 Measures of model performance train
  • Accuracy
# Training-set accuracy of the first classifier across the repeated runs.
# NOTE(review): rows 1-5 of the *_train matrices are presumably decision tree,
# linear SVM, radial SVM, logistic regression and random forest (inferred from
# the variable names used below) — confirm against the loop that fills them.
acc_trees <- acc_train[1,]
hist(acc_trees)

mean(acc_trees)
## [1] 0.9940476
median(acc_trees)
## [1] 1
sd(acc_trees)
## [1] 0.01090039
acc_svm <- acc_train[2,]
hist(acc_svm)

mean(acc_svm)
## [1] 0.9495238
median(acc_svm)
## [1] 0.952381
sd(acc_svm)
## [1] 0.03106779
acc_svmR <- acc_train[3,]
hist(acc_svmR)

mean(acc_svmR)
## [1] 0.8845238
median(acc_svmR)
## [1] 0.9047619
sd(acc_svmR)
## [1] 0.09880148
acc_logs <- acc_train[4,]
hist(acc_logs)

mean(acc_logs)
## [1] 1
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0
acc_rf <- acc_train[5,]
hist(acc_rf)

mean(acc_rf)
## [1] 1
median(acc_rf)
## [1] 1
sd(acc_rf)
## [1] 0
  • auc
# Training-set AUC per classifier.
# NOTE(review): the AUC summaries printed in this subsection are identical to
# the accuracy summaries printed above. Possibly AUC is computed from predicted
# class labels rather than scores — verify how auc_train is filled.
auc_trees <- auc_train[1,]
mean(auc_trees)
## [1] 0.9940476
median(auc_trees)
## [1] 1
sd(auc_trees)
## [1] 0.01090039
auc_svm <- auc_train[2,]
mean(auc_svm)
## [1] 0.9495238
median(auc_svm)
## [1] 0.952381
sd(auc_svm)
## [1] 0.03106779
auc_svmR <- auc_train[3,]
mean(auc_svmR)
## [1] 0.8845238
median(auc_svmR)
## [1] 0.9047619
sd(auc_svmR)
## [1] 0.09880148
auc_logs <- auc_train[4,]
mean(auc_logs)
## [1] 1
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0
auc_rf <- auc_train[5,]
mean(auc_rf)
## [1] 1
median(auc_rf)
## [1] 1
sd(auc_rf)
## [1] 0
  • Misclassification
# Training-set misclassification per classifier. The values printed below
# exceed 1, so these appear to be counts of misclassified samples per run
# rather than rates.
miscl_trees <- miscl_train[1,]
mean(miscl_trees)
## [1] 0.25
median(miscl_trees)
## [1] 0
sd(miscl_trees)
## [1] 0.4578165
miscl_svm <- miscl_train[2,]
mean(miscl_svm)
## [1] 2.12
median(miscl_svm)
## [1] 2
sd(miscl_svm)
## [1] 1.304847
miscl_svmR <- miscl_train[3,]
mean(miscl_svmR)
## [1] 4.85
median(miscl_svmR)
## [1] 4
sd(miscl_svmR)
## [1] 4.149662
miscl_logs <- miscl_train[4,]
mean(miscl_logs)
## [1] 0
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0
miscl_rf <- miscl_train[5,]
mean(miscl_rf)
## [1] 0
median(miscl_rf)
## [1] 0
sd(miscl_rf)
## [1] 0
  • Sensitivity
# sensitivity
sensitivity_trees <- sensitivity_train[1,]
mean(sensitivity_trees)
## [1] 0.9952381
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.01435768
sensitivity_svm <- sensitivity_train[2,]
mean(sensitivity_svm)
## [1] 0.9314286
median(sensitivity_svm)
## [1] 0.9761905
sd(sensitivity_svm)
## [1] 0.0790238
sensitivity_svmR <- sensitivity_train[3,]
mean(sensitivity_svmR)
## [1] 0.8514286
median(sensitivity_svmR)
## [1] 0.8095238
sd(sensitivity_svmR)
## [1] 0.1264926
sensitivity_logs <- sensitivity_train[4,]
mean(sensitivity_logs)
## [1] 1
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0
sensitivity_rf <- sensitivity_train[5,]
mean(sensitivity_rf)
## [1] 1
median(sensitivity_rf)
## [1] 1
sd(sensitivity_rf)
## [1] 0
  • Specificity
# specificity
specificity_trees <- specificity_train[1,]
mean(specificity_trees)
## [1] 0.9928571
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.01708906
specificity_svm <- specificity_train[2,]
mean(specificity_svm)
## [1] 0.967619
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.04380074
specificity_svmR <- specificity_train[3,]
mean(specificity_svmR)
## [1] 0.917619
median(specificity_svmR)
## [1] 1
sd(specificity_svmR)
## [1] 0.1126171
specificity_logs <- specificity_train[4,]
mean(specificity_logs)
## [1] 1
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0
specificity_rf <- specificity_train[5,]
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# fneg
fneg_trees <- fneg_train[1,]
mean(fneg_trees)
## [1] 0.1
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.3015113
fneg_svm <- fneg_train[2,]
mean(fneg_svm)
## [1] 1.44
median(fneg_svm)
## [1] 0.5
sd(fneg_svm)
## [1] 1.6595
fneg_svmR <- fneg_train[3,]
mean(fneg_svmR)
## [1] 3.12
median(fneg_svmR)
## [1] 4
sd(fneg_svmR)
## [1] 2.656344
fneg_logs <- fneg_train[4,]
mean(fneg_logs)
## [1] 0
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0
fneg_rf <- fneg_train[5,]
mean(fneg_rf)
## [1] 0
median(fneg_rf)
## [1] 0
sd(fneg_rf)
## [1] 0
3.1.5.1.2 Measures of model performance test
  • Accuracy
# Test-set accuracy of the first classifier across the repeated runs.
# NOTE: from here on the summary variables (acc_trees, acc_svm, ...) are
# re-assigned from the test-set matrices and overwrite the training-set values
# computed in the previous subsection.
acc_trees <- acc[1,]
hist(acc_trees)

mean(acc_trees)
## [1] 0.6333333
median(acc_trees)
## [1] 0.6111111
sd(acc_trees)
## [1] 0.09799079
acc_svm <- acc[2,]
hist(acc_svm)

mean(acc_svm)
## [1] 0.6861111
median(acc_svm)
## [1] 0.6666667
sd(acc_svm)
## [1] 0.1024622
acc_svmR <- acc[3,]
hist(acc_svmR)

mean(acc_svmR)
## [1] 0.6416667
median(acc_svmR)
## [1] 0.6666667
sd(acc_svmR)
## [1] 0.1009294
acc_logs <- acc[4,]
hist(acc_logs)

mean(acc_logs)
## [1] 0.6883333
median(acc_logs)
## [1] 0.6666667
sd(acc_logs)
## [1] 0.09238866
acc_rf <- acc[5,]
hist(acc_rf)

mean(acc_rf)
## [1] 0.7444444
median(acc_rf)
## [1] 0.7222222
sd(acc_rf)
## [1] 0.08898704
  • auc
# auc
auc_trees <- auc[1,]
mean(auc_trees)
## [1] 0.6377778
median(auc_trees)
## [1] 0.6111111
sd(auc_trees)
## [1] 0.09157011
auc_svm <- auc[2,]
mean(auc_svm)
## [1] 0.6861111
median(auc_svm)
## [1] 0.6666667
sd(auc_svm)
## [1] 0.1024622
auc_svmR <- auc[3,]
mean(auc_svmR)
## [1] 0.6461111
median(auc_svmR)
## [1] 0.6666667
sd(auc_svmR)
## [1] 0.09431231
auc_logs <- auc[4,]
mean(auc_logs)
## [1] 0.6883333
median(auc_logs)
## [1] 0.6666667
sd(auc_logs)
## [1] 0.09238866
auc_rf <- auc[5,]
mean(auc_rf)
## [1] 0.7444444
median(auc_rf)
## [1] 0.7222222
sd(auc_rf)
## [1] 0.08898704
  • Misclassification
# miscl
miscl_trees <- miscl[1,]
mean(miscl_trees)
## [1] 6.6
median(miscl_trees)
## [1] 7
sd(miscl_trees)
## [1] 1.763834
miscl_svm <- miscl[2,]
mean(miscl_svm)
## [1] 5.65
median(miscl_svm)
## [1] 6
sd(miscl_svm)
## [1] 1.84432
miscl_svmR <- miscl[3,]
mean(miscl_svmR)
## [1] 6.45
median(miscl_svmR)
## [1] 6
sd(miscl_svmR)
## [1] 1.816729
miscl_logs <- miscl[4,]
mean(miscl_logs)
## [1] 5.61
median(miscl_logs)
## [1] 6
sd(miscl_logs)
## [1] 1.662996
miscl_rf <- miscl[5,]
mean(miscl_rf)
## [1] 4.6
median(miscl_rf)
## [1] 5
sd(miscl_rf)
## [1] 1.601767
  • Sensitivity
# sensitivity
sensitivity_trees <- sensitivity[1,]
mean(sensitivity_trees)
## [1] 0.6744444
median(sensitivity_trees)
## [1] 0.6666667
sd(sensitivity_trees)
## [1] 0.1627907
sensitivity_svm <- sensitivity[2,]
mean(sensitivity_svm)
## [1] 0.6488889
median(sensitivity_svm)
## [1] 0.6666667
sd(sensitivity_svm)
## [1] 0.1756608
sensitivity_svmR <- sensitivity[3,]
mean(sensitivity_svmR)
## [1] 0.7088889
median(sensitivity_svmR)
## [1] 0.6666667
sd(sensitivity_svmR)
## [1] 0.1846997
sensitivity_logs <- sensitivity[4,]
mean(sensitivity_logs)
## [1] 0.6866667
median(sensitivity_logs)
## [1] 0.6666667
sd(sensitivity_logs)
## [1] 0.1424668
sensitivity_rf <- sensitivity[5,]
mean(sensitivity_rf)
## [1] 0.7255556
median(sensitivity_rf)
## [1] 0.7222222
sd(sensitivity_rf)
## [1] 0.1258021
  • Specificity
# specificity
specificity_trees <- specificity[1,]
mean(specificity_trees)
## [1] 0.5922222
median(specificity_trees)
## [1] 0.5555556
sd(specificity_trees)
## [1] 0.1671747
specificity_svm <- specificity[2,]
mean(specificity_svm)
## [1] 0.7233333
median(specificity_svm)
## [1] 0.7777778
sd(specificity_svm)
## [1] 0.2092123
specificity_svmR <- specificity[3,]
mean(specificity_svmR)
## [1] 0.5744444
median(specificity_svmR)
## [1] 0.5555556
sd(specificity_svmR)
## [1] 0.198545
specificity_logs <- specificity[4,]
mean(specificity_logs)
## [1] 0.69
median(specificity_logs)
## [1] 0.6666667
sd(specificity_logs)
## [1] 0.1628213
specificity_rf <- specificity[5,]
mean(specificity_rf)
## [1] 0.7633333
median(specificity_rf)
## [1] 0.7777778
sd(specificity_rf)
## [1] 0.1400613
  • False Neg
# fneg
fneg_trees <- fneg[1,]
mean(fneg_trees)
## [1] 2.93
median(fneg_trees)
## [1] 3
sd(fneg_trees)
## [1] 1.465116
fneg_svm <- fneg[2,]
mean(fneg_svm)
## [1] 3.16
median(fneg_svm)
## [1] 3
sd(fneg_svm)
## [1] 1.580947
fneg_svmR <- fneg[3,]
mean(fneg_svmR)
## [1] 2.62
median(fneg_svmR)
## [1] 3
sd(fneg_svmR)
## [1] 1.662297
fneg_logs <- fneg[4,]
mean(fneg_logs)
## [1] 2.82
median(fneg_logs)
## [1] 3
sd(fneg_logs)
## [1] 1.282201
fneg_rf <- fneg[5,]
mean(fneg_rf)
## [1] 2.47
median(fneg_rf)
## [1] 2.5
sd(fneg_rf)
## [1] 1.132219

3.1.5.2 2) Classification with regularization (EN and iTwiner)

# Rebuild the predictor and outcome tables from the full expression table for
# the regularized models (no feature filtering is applied in this chunk).
xdata <- xdataT


# Outcome table: class label from clinic1 plus the sample ID from DATASET1.
# NOTE(review): the two are combined by position, which assumes clinic1 and
# DATASET1 list the samples in the same order — confirm.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID
# 
# # ydata.raw$class <- c(rep(0,28),rep(1,34))
#  
# Recode the class labels: "P" -> 0, "Pm" -> 1
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# Keep only the samples present in both tables
xdata <- xdata[rownames(xdata) %in% 
                         rownames(ydata.raw),]

ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% 
                         rownames(xdata),])

# Sort both tables by sample ID
xdata <- xdata[ order(row.names(xdata)), ]

ydata.raw <- ydata.raw[ order(row.names(ydata.raw)), ]



# Final outcome table, with the recoded labels converted to numeric
ydata <- as.data.frame(ydata.raw)
ydata$`clinic1$class` <- as.numeric(ydata$`clinic1$class`)
## [1] 0
## [1] 1
## [1] 1011
## [1] "data"
## [1] "EN"
## [1] 1
## [1] 2
## [1] 1022
## [1] "data"
## [1] "EN"
## [1] 2
## [1] 3
## [1] 1033
## [1] "data"
## [1] "EN"
## [1] 3
## [1] 4
## [1] 1044
## [1] "data"
## [1] "EN"
## [1] 4
## [1] 5
## [1] 1055
## [1] "data"
## [1] "EN"
## [1] 5
## [1] 6
## [1] 1066
## [1] "data"
## [1] "EN"
## [1] 6
## [1] 7
## [1] 1077
## [1] "data"
## [1] "EN"
## [1] 7
## [1] 8
## [1] 1088
## [1] "data"
## [1] "EN"
## [1] 8
## [1] 9
## [1] 1099
## [1] "data"
## [1] "EN"
## [1] 9
## [1] 10
## [1] 1110
## [1] "data"
## [1] "EN"
## [1] 10
## [1] 11
## [1] 1121
## [1] "data"
## [1] "EN"
## [1] 11
## [1] 12
## [1] 1132
## [1] "data"
## [1] "EN"
## [1] 12
## [1] 13
## [1] 1143
## [1] "data"
## [1] "EN"
## [1] 13
## [1] 14
## [1] 1154
## [1] "data"
## [1] "EN"
## [1] 14
## [1] 15
## [1] 1165
## [1] "data"
## [1] "EN"
## [1] 15
## [1] 16
## [1] 1176
## [1] "data"
## [1] "EN"
## [1] 16
## [1] 17
## [1] 1187
## [1] "data"
## [1] "EN"
## [1] 17
## [1] 18
## [1] 1198
## [1] "data"
## [1] "EN"
## [1] 18
## [1] 19
## [1] 1209
## [1] "data"
## [1] "EN"
## [1] 19
## [1] 20
## [1] 1220
## [1] "data"
## [1] "EN"
## [1] 20
## [1] 21
## [1] 1231
## [1] "data"
## [1] "EN"
## [1] 21
## [1] 22
## [1] 1242
## [1] "data"
## [1] "EN"
## [1] 22
## [1] 23
## [1] 1253
## [1] "data"
## [1] "EN"
## [1] 23
## [1] 24
## [1] 1264
## [1] "data"
## [1] "EN"
## [1] 24
## [1] 25
## [1] 1275
## [1] "data"
## [1] "EN"
## [1] 25
## [1] 26
## [1] 1286
## [1] "data"
## [1] "EN"
## [1] 26
## [1] 27
## [1] 1297
## [1] "data"
## [1] "EN"
## [1] 27
## [1] 28
## [1] 1308
## [1] "data"
## [1] "EN"
## [1] 28
## [1] 29
## [1] 1319
## [1] "data"
## [1] "EN"
## [1] 29
## [1] 30
## [1] 1330
## [1] "data"
## [1] "EN"
## [1] 30
## [1] 31
## [1] 1341
## [1] "data"
## [1] "EN"
## [1] 31
## [1] 32
## [1] 1352
## [1] "data"
## [1] "EN"
## [1] 32
## [1] 33
## [1] 1363
## [1] "data"
## [1] "EN"
## [1] 33
## [1] 34
## [1] 1374
## [1] "data"
## [1] "EN"
## [1] 34
## [1] 35
## [1] 1385
## [1] "data"
## [1] "EN"
## [1] 35
## [1] 36
## [1] 1396
## [1] "data"
## [1] "EN"
## [1] 36
## [1] 37
## [1] 1407
## [1] "data"
## [1] "EN"
## [1] 37
## [1] 38
## [1] 1418
## [1] "data"
## [1] "EN"
## [1] 38
## [1] 39
## [1] 1429
## [1] "data"
## [1] "EN"
## [1] 39
## [1] 40
## [1] 1440
## [1] "data"
## [1] "EN"
## [1] 40
## [1] 41
## [1] 1451
## [1] "data"
## [1] "EN"
## [1] 41
## [1] 42
## [1] 1462
## [1] "data"
## [1] "EN"
## [1] 42
## [1] 43
## [1] 1473
## [1] "data"
## [1] "EN"
## [1] 43
## [1] 44
## [1] 1484
## [1] "data"
## [1] "EN"
## [1] 44
## [1] 45
## [1] 1495
## [1] "data"
## [1] "EN"
## [1] 45
## [1] 46
## [1] 1506
## [1] "data"
## [1] "EN"
## [1] 46
## [1] 47
## [1] 1517
## [1] "data"
## [1] "EN"
## [1] 47
## [1] 48
## [1] 1528
## [1] "data"
## [1] "EN"
## [1] 48
## [1] 49
## [1] 1539
## [1] "data"
## [1] "EN"
## [1] 49
## [1] 50
## [1] 1550
## [1] "data"
## [1] "EN"
## [1] 50
## [1] 51
## [1] 1561
## [1] "data"
## [1] "EN"
## [1] 51
## [1] 52
## [1] 1572
## [1] "data"
## [1] "EN"
## [1] 52
## [1] 53
## [1] 1583
## [1] "data"
## [1] "EN"
## [1] 53
## [1] 54
## [1] 1594
## [1] "data"
## [1] "EN"
## [1] 54
## [1] 55
## [1] 1605
## [1] "data"
## [1] "EN"
## [1] 55
## [1] 56
## [1] 1616
## [1] "data"
## [1] "EN"
## [1] 56
## [1] 57
## [1] 1627
## [1] "data"
## [1] "EN"
## [1] 57
## [1] 58
## [1] 1638
## [1] "data"
## [1] "EN"
## [1] 58
## [1] 59
## [1] 1649
## [1] "data"
## [1] "EN"
## [1] 59
## [1] 60
## [1] 1660
## [1] "data"
## [1] "EN"
## [1] 60
## [1] 61
## [1] 1671
## [1] "data"
## [1] "EN"
## [1] 61
## [1] 62
## [1] 1682
## [1] "data"
## [1] "EN"
## [1] 62
## [1] 63
## [1] 1693
## [1] "data"
## [1] "EN"
## [1] 63
## [1] 64
## [1] 1704
## [1] "data"
## [1] "EN"
## [1] 64
## [1] 65
## [1] 1715
## [1] "data"
## [1] "EN"
## [1] 65
## [1] 66
## [1] 1726
## [1] "data"
## [1] "EN"
## [1] 66
## [1] 67
## [1] 1737
## [1] "data"
## [1] "EN"
## [1] 67
## [1] 68
## [1] 1748
## [1] "data"
## [1] "EN"
## [1] 68
## [1] 69
## [1] 1759
## [1] "data"
## [1] "EN"
## [1] 69
## [1] 70
## [1] 1770
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 69
## [1] 71
## [1] 1781
## [1] "data"
## [1] "EN"
## [1] 70
## [1] 72
## [1] 1792
## [1] "data"
## [1] "EN"
## [1] 71
## [1] 73
## [1] 1803
## [1] "data"
## [1] "EN"
## [1] 72
## [1] 74
## [1] 1814
## [1] "data"
## [1] "EN"
## [1] 73
## [1] 75
## [1] 1825
## [1] "data"
## [1] "EN"
## [1] 74
## [1] 76
## [1] 1836
## [1] "data"
## [1] "EN"
## [1] 75
## [1] 77
## [1] 1847
## [1] "data"
## [1] "EN"
## [1] 76
## [1] 78
## [1] 1858
## [1] "data"
## [1] "EN"
## [1] 77
## [1] 79
## [1] 1869
## [1] "data"
## [1] "EN"
## [1] 78
## [1] 80
## [1] 1880
## [1] "data"
## [1] "EN"
## [1] 79
## [1] 81
## [1] 1891
## [1] "data"
## [1] "EN"
## [1] 80
## [1] 82
## [1] 1902
## [1] "data"
## [1] "EN"
## [1] 81
## [1] 83
## [1] 1913
## [1] "data"
## [1] "EN"
## [1] 82
## [1] 84
## [1] 1924
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 82
## [1] 85
## [1] 1935
## [1] "data"
## [1] "EN"
## [1] 83
## [1] 86
## [1] 1946
## [1] "data"
## [1] "EN"
## [1] 84
## [1] 87
## [1] 1957
## [1] "data"
## [1] "EN"
## [1] 85
## [1] 88
## [1] 1968
## [1] "data"
## [1] "EN"
## [1] 86
## [1] 89
## [1] 1979
## [1] "data"
## [1] "EN"
## [1] 87
## [1] 90
## [1] 1990
## [1] "data"
## [1] "EN"
## [1] 88
## [1] 91
## [1] 2001
## [1] "data"
## [1] "EN"
## [1] 89
## [1] 92
## [1] 2012
## [1] "data"
## [1] "EN"
## [1] 90
## [1] 93
## [1] 2023
## [1] "data"
## [1] "EN"
## [1] 91
## [1] 94
## [1] 2034
## [1] "data"
## [1] "EN"
## [1] 92
## [1] 95
## [1] 2045
## [1] "data"
## [1] "EN"
## [1] 93
## [1] 96
## [1] 2056
## [1] "data"
## [1] "EN"
## [1] 94
## [1] 97
## [1] 2067
## [1] "data"
## [1] "EN"
## [1] 95
## [1] 98
## [1] 2078
## [1] "data"
## [1] "EN"
## [1] 96
## [1] 99
## [1] 2089
## [1] "data"
## [1] "EN"
## [1] 97
## [1] 100
## [1] 2100
## [1] "data"
## [1] "EN"
## [1] 98
## [1] 101
## [1] 2111
## [1] "data"
## [1] "EN"
## [1] 99
## [1] 102
## [1] 2122
## [1] "data"
## [1] "EN"
## [1] 1
## [1] 0
## [1] 2133
## [1] "data"
## [1] "iTwiner"
## [1] 2
## [1] 1
## [1] 2144
## [1] "data"
## [1] "iTwiner"
## [1] 3
## [1] 2
## [1] 2155
## [1] "data"
## [1] "iTwiner"
## [1] 4
## [1] 3
## [1] 2166
## [1] "data"
## [1] "iTwiner"
## [1] 5
## [1] 4
## [1] 2177
## [1] "data"
## [1] "iTwiner"
## [1] 6
## [1] 5
## [1] 2188
## [1] "data"
## [1] "iTwiner"
## [1] 7
## [1] 6
## [1] 2199
## [1] "data"
## [1] "iTwiner"
## [1] 8
## [1] 7
## [1] 2210
## [1] "data"
## [1] "iTwiner"
## [1] 9
## [1] 8
## [1] 2221
## [1] "data"
## [1] "iTwiner"
## [1] 10
## [1] 9
## [1] 2232
## [1] "data"
## [1] "iTwiner"
## [1] 11
## [1] 10
## [1] 2243
## [1] "data"
## [1] "iTwiner"
## [1] 12
## [1] 11
## [1] 2254
## [1] "data"
## [1] "iTwiner"
## [1] 13
## [1] 12
## [1] 2265
## [1] "data"
## [1] "iTwiner"
## [1] 14
## [1] 13
## [1] 2276
## [1] "data"
## [1] "iTwiner"
## [1] 15
## [1] 14
## [1] 2287
## [1] "data"
## [1] "iTwiner"
## [1] 16
## [1] 15
## [1] 2298
## [1] "data"
## [1] "iTwiner"
## [1] 17
## [1] 16
## [1] 2309
## [1] "data"
## [1] "iTwiner"
## [1] 18
## [1] 17
## [1] 2320
## [1] "data"
## [1] "iTwiner"
## [1] 19
## [1] 18
## [1] 2331
## [1] "data"
## [1] "iTwiner"
## [1] 20
## [1] 19
## [1] 2342
## [1] "data"
## [1] "iTwiner"
## [1] 21
## [1] 20
## [1] 2353
## [1] "data"
## [1] "iTwiner"
## [1] 22
## [1] 21
## [1] 2364
## [1] "data"
## [1] "iTwiner"
## [1] 23
## [1] 22
## [1] 2375
## [1] "data"
## [1] "iTwiner"
## [1] 24
## [1] 23
## [1] 2386
## [1] "data"
## [1] "iTwiner"
## [1] 25
## [1] 24
## [1] 2397
## [1] "data"
## [1] "iTwiner"
## [1] 26
## [1] 25
## [1] 2408
## [1] "data"
## [1] "iTwiner"
## [1] 27
## [1] 26
## [1] 2419
## [1] "data"
## [1] "iTwiner"
## [1] 28
## [1] 27
## [1] 2430
## [1] "data"
## [1] "iTwiner"
## [1] 29
## [1] 28
## [1] 2441
## [1] "data"
## [1] "iTwiner"
## [1] 30
## [1] 29
## [1] 2452
## [1] "data"
## [1] "iTwiner"
## [1] 31
## [1] 30
## [1] 2463
## [1] "data"
## [1] "iTwiner"
## [1] 32
## [1] 31
## [1] 2474
## [1] "data"
## [1] "iTwiner"
## [1] 33
## [1] 32
## [1] 2485
## [1] "data"
## [1] "iTwiner"
## [1] 34
## [1] 33
## [1] 2496
## [1] "data"
## [1] "iTwiner"
## [1] 35
## [1] 34
## [1] 2507
## [1] "data"
## [1] "iTwiner"
## [1] 36
## [1] 35
## [1] 2518
## [1] "data"
## [1] "iTwiner"
## [1] 37
## [1] 36
## [1] 2529
## [1] "data"
## [1] "iTwiner"
## [1] 38
## [1] 37
## [1] 2540
## [1] "data"
## [1] "iTwiner"
## [1] 39
## [1] 38
## [1] 2551
## [1] "data"
## [1] "iTwiner"
## [1] 40
## [1] 39
## [1] 2562
## [1] "data"
## [1] "iTwiner"
## [1] 41
## [1] 40
## [1] 2573
## [1] "data"
## [1] "iTwiner"
## [1] 42
## [1] 41
## [1] 2584
## [1] "data"
## [1] "iTwiner"
## [1] 43
## [1] 42
## [1] 2595
## [1] "data"
## [1] "iTwiner"
## [1] 44
## [1] 43
## [1] 2606
## [1] "data"
## [1] "iTwiner"
## [1] 45
## [1] 44
## [1] 2617
## [1] "data"
## [1] "iTwiner"
## [1] 46
## [1] 45
## [1] 2628
## [1] "data"
## [1] "iTwiner"
## [1] 47
## [1] 46
## [1] 2639
## [1] "data"
## [1] "iTwiner"
## [1] 48
## [1] 47
## [1] 2650
## [1] "data"
## [1] "iTwiner"
## [1] 49
## [1] 48
## [1] 2661
## [1] "data"
## [1] "iTwiner"
## [1] 50
## [1] 49
## [1] 2672
## [1] "data"
## [1] "iTwiner"
## [1] 51
## [1] 50
## [1] 2683
## [1] "data"
## [1] "iTwiner"
## [1] 52
## [1] 51
## [1] 2694
## [1] "data"
## [1] "iTwiner"
## [1] 53
## [1] 52
## [1] 2705
## [1] "data"
## [1] "iTwiner"
## [1] 54
## [1] 53
## [1] 2716
## [1] "data"
## [1] "iTwiner"
## [1] 55
## [1] 54
## [1] 2727
## [1] "data"
## [1] "iTwiner"
## [1] 56
## [1] 55
## [1] 2738
## [1] "data"
## [1] "iTwiner"
## [1] 57
## [1] 56
## [1] 2749
## [1] "data"
## [1] "iTwiner"
## [1] 58
## [1] 57
## [1] 2760
## [1] "data"
## [1] "iTwiner"
## [1] 59
## [1] 58
## [1] 2771
## [1] "data"
## [1] "iTwiner"
## [1] 60
## [1] 59
## [1] 2782
## [1] "data"
## [1] "iTwiner"
## [1] 61
## [1] 60
## [1] 2793
## [1] "data"
## [1] "iTwiner"
## [1] 62
## [1] 61
## [1] 2804
## [1] "data"
## [1] "iTwiner"
## [1] 63
## [1] 62
## [1] 2815
## [1] "data"
## [1] "iTwiner"
## [1] 64
## [1] 63
## [1] 2826
## [1] "data"
## [1] "iTwiner"
## [1] 65
## [1] 64
## [1] 2837
## [1] "data"
## [1] "iTwiner"
## [1] 66
## [1] 65
## [1] 2848
## [1] "data"
## [1] "iTwiner"
## [1] 67
## [1] 66
## [1] 2859
## [1] "data"
## [1] "iTwiner"
## [1] 68
## [1] 67
## [1] 2870
## [1] "data"
## [1] "iTwiner"
## [1] 69
## [1] 68
## [1] 2881
## [1] "data"
## [1] "iTwiner"
## [1] 70
## [1] 69
## [1] 2892
## [1] "data"
## [1] "iTwiner"
## [1] 71
## [1] 70
## [1] 2903
## [1] "data"
## [1] "iTwiner"
## [1] 72
## [1] 71
## [1] 2914
## [1] "data"
## [1] "iTwiner"
## [1] 73
## [1] 72
## [1] 2925
## [1] "data"
## [1] "iTwiner"
## [1] 74
## [1] 73
## [1] 2936
## [1] "data"
## [1] "iTwiner"
## [1] 75
## [1] 74
## [1] 2947
## [1] "data"
## [1] "iTwiner"
## [1] 76
## [1] 75
## [1] 2958
## [1] "data"
## [1] "iTwiner"
## [1] 77
## [1] 76
## [1] 2969
## [1] "data"
## [1] "iTwiner"
## [1] 78
## [1] 77
## [1] 2980
## [1] "data"
## [1] "iTwiner"
## [1] 79
## [1] 78
## [1] 2991
## [1] "data"
## [1] "iTwiner"
## [1] 80
## [1] 79
## [1] 3002
## [1] "data"
## [1] "iTwiner"
## [1] 81
## [1] 80
## [1] 3013
## [1] "data"
## [1] "iTwiner"
## [1] 82
## [1] 81
## [1] 3024
## [1] "data"
## [1] "iTwiner"
## [1] 83
## [1] 82
## [1] 3035
## [1] "data"
## [1] "iTwiner"
## [1] 84
## [1] 83
## [1] 3046
## [1] "data"
## [1] "iTwiner"
## [1] 85
## [1] 84
## [1] 3057
## [1] "data"
## [1] "iTwiner"
## [1] 86
## [1] 85
## [1] 3068
## [1] "data"
## [1] "iTwiner"
## [1] 87
## [1] 86
## [1] 3079
## [1] "data"
## [1] "iTwiner"
## [1] 88
## [1] 87
## [1] 3090
## [1] "data"
## [1] "iTwiner"
## [1] 89
## [1] 88
## [1] 3101
## [1] "data"
## [1] "iTwiner"
## [1] 90
## [1] 89
## [1] 3112
## [1] "data"
## [1] "iTwiner"
## [1] 91
## [1] 90
## [1] 3123
## [1] "data"
## [1] "iTwiner"
## [1] 92
## [1] 91
## [1] 3134
## [1] "data"
## [1] "iTwiner"
## [1] 93
## [1] 92
## [1] 3145
## [1] "data"
## [1] "iTwiner"
## [1] 94
## [1] 93
## [1] 3156
## [1] "data"
## [1] "iTwiner"
## [1] 95
## [1] 94
## [1] 3167
## [1] "data"
## [1] "iTwiner"
## [1] 96
## [1] 95
## [1] 3178
## [1] "data"
## [1] "iTwiner"
## [1] 97
## [1] 96
## [1] 3189
## [1] "data"
## [1] "iTwiner"
## [1] 98
## [1] 97
## [1] 3200
## [1] "data"
## [1] "iTwiner"
## [1] 99
## [1] 98
## [1] 3211
## [1] "data"
## [1] "iTwiner"
## [1] 100
## [1] 99
## [1] 3222
## [1] "data"
## [1] "iTwiner"
3.1.5.2.1 Measures of model performance
# Indices of the runs summarised below for each method (row 1 = EN,
# row 2 = iTwiner), on train and on test: the runs whose stored accuracy is
# non-zero.
# NOTE(review): presumably 0 marks runs where no model was fitted (the log above
# reports runs in which EN selected no variables) — confirm. A genuine accuracy
# of exactly 0 would also be excluded by this filter.
runs_en_tr <- which(acc_cox_tr[1,]!=0)
runs_itw_tr <- which(acc_cox_tr[2,]!=0)

runs_en_ts <- which(acc_cox_tes[1,]!=0)
runs_itw_ts <- which(acc_cox_tes[2,]!=0)
  • Number of variables selected
# Number of variables selected by EN (row 1 of nvar_selected_1), restricted to
# the retained training runs; mean, median and sd are printed below
nvar_en <- nvar_selected_1[1,]
nvar_en <- nvar_en[runs_en_tr]
mean(nvar_en)
## [1] 66.97
median(nvar_en)
## [1] 59
sd(nvar_en)
## [1] 32.63092
nvar_tw <- nvar_selected_1[2,]
nvar_tw <- nvar_tw[runs_itw_tr]
mean(nvar_tw)
## [1] 36.54
median(nvar_tw)
## [1] 33
sd(nvar_tw)
## [1] 21.97539
  • Accuracy
# EN train
acc_cox_tr_EN <- acc_cox_tr[1,]
acc_cox_tr_EN <- acc_cox_tr_EN[runs_en_tr]
#hist(acc_cox_tr_EN)
mean(acc_cox_tr_EN)  
## [1] 0.867381
median(acc_cox_tr_EN)
## [1] 0.8571429
sd(acc_cox_tr_EN)
## [1] 0.03285491
# EN test
acc_cox_tes_EN <- acc_cox_tes[1,]
acc_cox_tes_EN <- acc_cox_tes_EN[runs_en_ts]
#hist(acc_cox_tes_EN)
mean(acc_cox_tes_EN)
## [1] 0.6833333
median(acc_cox_tes_EN)
## [1] 0.6666667
sd(acc_cox_tes_EN)
## [1] 0.09259259
#iTwiner train
acc_cox_tr_iTwiner <- acc_cox_tr[2,]
acc_cox_tr_iTwiner <- acc_cox_tr_iTwiner[runs_itw_tr]
#hist(acc_cox_tr_iTwiner)
mean(acc_cox_tr_iTwiner)
## [1] 0.8511905
median(acc_cox_tr_iTwiner)
## [1] 0.8571429
sd(acc_cox_tr_iTwiner)
## [1] 0.02918513
#iTwiner test
acc_cox_tes_iTwiner <- acc_cox_tes[2,]
acc_cox_tes_iTwiner <- acc_cox_tes_iTwiner[runs_itw_ts]
#hist(acc_cox_tes_iTwiner)
mean(acc_cox_tes_iTwiner)
## [1] 0.7938889
median(acc_cox_tes_iTwiner)
## [1] 0.7777778
sd(acc_cox_tes_iTwiner)
## [1] 0.07461712
  • miscl
# EN train
miscl_cox_tr_EN <- miscl_cox_tr[1,]
miscl_cox_tr_EN <- miscl_cox_tr_EN[runs_en_tr]
mean(miscl_cox_tr_EN)  
## [1] 5.57
median(miscl_cox_tr_EN)
## [1] 6
sd(miscl_cox_tr_EN)
## [1] 1.379906
# EN test
miscl_cox_tes_EN <- miscl_cox_tes[1,]
miscl_cox_tes_EN <- miscl_cox_tes_EN[runs_en_ts]
mean(miscl_cox_tes_EN)
## [1] 5.7
median(miscl_cox_tes_EN)
## [1] 6
sd(miscl_cox_tes_EN)
## [1] 1.666667
#iTwiner train
miscl_cox_tr_iTwiner <- miscl_cox_tr[2,]
miscl_cox_tr_iTwiner <- miscl_cox_tr_iTwiner[runs_itw_tr]
mean(miscl_cox_tr_iTwiner)
## [1] 6.25
median(miscl_cox_tr_iTwiner)
## [1] 6
sd(miscl_cox_tr_iTwiner)
## [1] 1.225775
#iTwiner test
miscl_cox_tes_iTwiner <- miscl_cox_tes[2,]
miscl_cox_tes_iTwiner <- miscl_cox_tes_iTwiner[runs_itw_ts]
mean(miscl_cox_tes_iTwiner)
## [1] 3.71
median(miscl_cox_tes_iTwiner)
## [1] 4
sd(miscl_cox_tes_iTwiner)
## [1] 1.343108
  • False Neg
#EN
fneg_reg_en_train <- fneg_reg_tr[1,runs_en_tr]
mean(fneg_reg_en_train)
## [1] 5.39
median(fneg_reg_en_train)
## [1] 5.5
sd(fneg_reg_en_train)
## [1] 1.270449
fneg_reg_en_test <- fneg_reg_ts[1,runs_en_ts]
mean(fneg_reg_en_test)
## [1] 3.1
median(fneg_reg_en_test)
## [1] 3
sd(fneg_reg_en_test)
## [1] 1.13262
#iTwiner
fneg_reg_iTwiner_train <- fneg_reg_tr[2,runs_itw_tr]
mean(fneg_reg_iTwiner_train)
## [1] 6.25
median(fneg_reg_iTwiner_train)
## [1] 6
sd(fneg_reg_iTwiner_train)
## [1] 1.225775
fneg_reg_iTwiner_test <- fneg_reg_ts[2,runs_itw_ts]
mean(fneg_reg_iTwiner_test)
## [1] 3.62
median(fneg_reg_iTwiner_test)
## [1] 4
sd(fneg_reg_iTwiner_test)
## [1] 1.361669
  • sensitivity
# EN train
sensitivity_cox_tr_EN <- sensitivity_cox_tr[1,]
sensitivity_cox_tr_EN <- sensitivity_cox_tr_EN[runs_en_tr]
mean(sensitivity_cox_tr_EN)  
## [1] 0.7433333
median(sensitivity_cox_tr_EN)
## [1] 0.7380952
sd(sensitivity_cox_tr_EN)
## [1] 0.06049757
# EN test
sensitivity_cox_tes_EN <- sensitivity_cox_tes[1,]
sensitivity_cox_tes_EN <- sensitivity_cox_tes_EN[runs_en_ts]
mean(sensitivity_cox_tes_EN)
## [1] 0.6555556
median(sensitivity_cox_tes_EN)
## [1] 0.6666667
sd(sensitivity_cox_tes_EN)
## [1] 0.1258467
#iTwiner train
sensitivity_cox_tr_iTwiner <- sensitivity_cox_tr[2,]
sensitivity_cox_tr_iTwiner <- sensitivity_cox_tr_iTwiner[runs_itw_tr]
mean(sensitivity_cox_tr_iTwiner)
## [1] 0.702381
median(sensitivity_cox_tr_iTwiner)
## [1] 0.7142857
sd(sensitivity_cox_tr_iTwiner)
## [1] 0.05837026
#iTwiner test
sensitivity_cox_tes_iTwiner <- sensitivity_cox_tes[2,]
sensitivity_cox_tes_iTwiner <- sensitivity_cox_tes_iTwiner[runs_itw_ts]
mean(sensitivity_cox_tes_iTwiner)
## [1] 0.5977778
median(sensitivity_cox_tes_iTwiner)
## [1] 0.5555556
sd(sensitivity_cox_tes_iTwiner)
## [1] 0.1512965
  • specificity
# Specificity of the regularised models: row 1 of specificity_cox_tr /
# specificity_cox_tes is summarised as EN, row 2 as iTwiner. Each vector is
# restricted to the run indices runs_en_* / runs_itw_* before taking the
# mean, median and SD.
# EN train
specificity_cox_tr_EN <- specificity_cox_tr[1,]
specificity_cox_tr_EN <- specificity_cox_tr_EN[runs_en_tr]
mean(specificity_cox_tr_EN)  
## [1] 0.9914286
median(specificity_cox_tr_EN)
## [1] 1
sd(specificity_cox_tr_EN)
## [1] 0.02180604
# EN test
specificity_cox_tes_EN <- specificity_cox_tes[1,]
specificity_cox_tes_EN <- specificity_cox_tes_EN[runs_en_ts]
mean(specificity_cox_tes_EN)
## [1] 0.7111111
median(specificity_cox_tes_EN)
## [1] 0.6666667
sd(specificity_cox_tes_EN)
## [1] 0.1421338
# iTwiner train
specificity_cox_tr_iTwiner <- specificity_cox_tr[2,]
specificity_cox_tr_iTwiner <- specificity_cox_tr_iTwiner[runs_itw_tr]
mean(specificity_cox_tr_iTwiner)
## [1] 1
median(specificity_cox_tr_iTwiner)
## [1] 1
sd(specificity_cox_tr_iTwiner)
## [1] 0
# iTwiner test
specificity_cox_tes_iTwiner <- specificity_cox_tes[2,]
specificity_cox_tes_iTwiner <- specificity_cox_tes_iTwiner[runs_itw_ts]
mean(specificity_cox_tes_iTwiner)
## [1] 0.99
median(specificity_cox_tes_iTwiner)
## [1] 1
sd(specificity_cox_tes_iTwiner)
## [1] 0.03564733
  • auc
# AUC of the regularised models: row 1 of auc_cox_tr / auc_cox_tes is
# summarised as EN, row 2 as iTwiner. Each vector is restricted to the run
# indices runs_en_* / runs_itw_* before taking the mean, median and SD.
# EN train
auc_cox_tr_EN <- auc_cox_tr[1,]
auc_cox_tr_EN <- auc_cox_tr_EN[runs_en_tr]
mean(auc_cox_tr_EN)  
## [1] 0.867381
median(auc_cox_tr_EN)
## [1] 0.8571429
sd(auc_cox_tr_EN)
## [1] 0.03285491
# EN test
auc_cox_tes_EN <- auc_cox_tes[1,]
auc_cox_tes_EN <- auc_cox_tes_EN[runs_en_ts]
mean(auc_cox_tes_EN)
## [1] 0.6833333
median(auc_cox_tes_EN)
## [1] 0.6666667
sd(auc_cox_tes_EN)
## [1] 0.09259259
# iTwiner train
auc_cox_tr_iTwiner <- auc_cox_tr[2,]
auc_cox_tr_iTwiner <- auc_cox_tr_iTwiner[runs_itw_tr]
mean(auc_cox_tr_iTwiner)
## [1] 0.8511905
median(auc_cox_tr_iTwiner)
## [1] 0.8571429
sd(auc_cox_tr_iTwiner)
## [1] 0.02918513
# iTwiner test
auc_cox_tes_iTwiner <- auc_cox_tes[2,]
auc_cox_tes_iTwiner <- auc_cox_tes_iTwiner[runs_itw_ts]
mean(auc_cox_tes_iTwiner)
## [1] 0.7938889
median(auc_cox_tes_iTwiner)
## [1] 0.7777778
sd(auc_cox_tes_iTwiner)
## [1] 0.07461712
  • Names of genes selected

Variables always selected

# Variables selected in every retained run: take the per-run selections
# indexed by runs_en_ts / runs_itw_ts and intersect them pairwise with Reduce().
# NOTE(review): Reduce() over an empty list returns NULL, which is reported
# as 0 variables -- the same figure as "no gene shared by all runs".
var_selected_alw_select_en <- var_selected_en1[runs_en_ts]
var_selected_alw_select_en1 <-  Reduce(intersect,var_selected_alw_select_en)
print(paste("variables always selected by EN = ",length(var_selected_alw_select_en1)))
## [1] "variables always selected by EN =  0"
var_selected_alw_select_iTwiner <- var_selected_iTwiner1[runs_itw_ts]
var_selected_alw_select_iTwiner1 <-  Reduce(intersect,var_selected_alw_select_iTwiner)
print(paste("variables always selected by iTwiner = ",length(var_selected_alw_select_iTwiner1)))
## [1] "variables always selected by iTwiner =  2"

Variables selected in more than 50% of the bootstrap samples

# EN: tabulate how many of the retained runs selected each variable and keep
# those chosen in strictly more than half of the runs.
l <- length(var_selected_alw_select_en)
var_selected_50_select_en <- as.data.frame(table(unlist(var_selected_alw_select_en)))
var_selected_50_select_en <-
  var_selected_50_select_en[var_selected_50_select_en$Freq > 0.50 * l, ]
print(paste("variables selected 50% by EN = ", nrow(var_selected_50_select_en)))
## [1] "variables selected 50% by EN =  8"
var_selected_50_select_en$Var1
## [1] CD8B      LINC01871 PF4       RPL6P9    SFRP5     SLITRK1   SOHLH2   
## [8] XRCC6P2  
## 1815 Levels: AASDHPPT ABCC12 ABCD1P4 ABCE1 ABCF2-H2BE1 ABHD15 ABHD6 ACLY ... ZWINT

# iTwiner: same rule, using the number of retained iTwiner runs as denominator.
l <- length(var_selected_alw_select_iTwiner)
var_selected_50_select_iTwiner <- as.data.frame(table(unlist(var_selected_alw_select_iTwiner)))
var_selected_50_select_iTwiner <-
  var_selected_50_select_iTwiner[var_selected_50_select_iTwiner$Freq > 0.50 * l, ]
print(paste("variables selected 50% by iTwiner = ", nrow(var_selected_50_select_iTwiner)))
## [1] "variables selected 50% by iTwiner =  19"
var_selected_50_select_iTwiner$Var1
##  [1] COX6CP7    EEF1B2P6   FABP7P2    HNRNPA1P42 HSPD1P7    IGKV2D-30 
##  [7] KDM4F      LILRB1-AS1 LINC01100  LINC01335  MTCO2P18   OR51K1P   
## [13] RAC1P3     RPL31P35   RPL7P58    SULT6B2P   TRAJ20     TRBV11-1  
## [19] XRCC6P2   
## 236 Levels: ACTN4P1 AGTR2 ALOX15P2 AMMECR1LP1 ANKRD20A10P APOOP4 ... XRCC6P2
  • Select the 100 and the 50 most frequently selected variables
# Rank every variable EN selected in the retained runs by the number of runs
# that chose it. Note that this overwrites var_selected_50_select_en with the
# full frequency table (all variables), sorted by decreasing frequency.
var_selected_50_select_en <- table(unlist(var_selected_alw_select_en))
var_selected_50_select_en <- as.data.frame(var_selected_50_select_en)
var_selected_50_select_en <-
  var_selected_50_select_en[order(var_selected_50_select_en$Freq, decreasing = TRUE), ]
hist(var_selected_50_select_en$Freq)

# head() rather than [1:100, ] / [1:50]: positional indexing pads the result
# with NA rows when fewer variables are available, and those NAs would end up
# in the exported list and in later column look-ups.
top100_en <- head(var_selected_50_select_en, 100)
top100_en <- top100_en$Var1
top50_en <- head(top100_en, 50)
top100_en <- as.data.frame(top100_en)
write_xlsx(top100_en, "List_top100_en_d1.xlsx")

# Same ranking and export for iTwiner.
var_selected_50_select_iTwiner <- table(unlist(var_selected_alw_select_iTwiner))
var_selected_50_select_iTwiner <- as.data.frame(var_selected_50_select_iTwiner)
var_selected_50_select_iTwiner <-
  var_selected_50_select_iTwiner[order(var_selected_50_select_iTwiner$Freq, decreasing = TRUE), ]
hist(var_selected_50_select_iTwiner$Freq)

top100_itw <- head(var_selected_50_select_iTwiner, 100)
top100_itw <- top100_itw$Var1
top50_itw <- head(top100_itw, 50)
top100_itw <- as.data.frame(top100_itw)
write_xlsx(top100_itw, "List_top100_itw_d1.xlsx")

Variables in common between EN and iTwiner (among all variables selected at least once by each method)

# Variables present in both frequency tables, returned in the iTwiner table's
# row order.
# NOTE(review): despite the "_50_" in the names, both tables were rebuilt in
# the previous step as the full frequency tables (no >50% filter), so this is
# the overlap of all variables ever selected -- 44 below, whereas the >50%
# lists printed earlier hold only 8 and 19 genes. Confirm this is intended.
common_var_selected_50_en_iTwiner <- var_selected_50_select_iTwiner$Var1[which(var_selected_50_select_iTwiner$Var1 %in% var_selected_50_select_en$Var1)]
length(common_var_selected_50_en_iTwiner)
## [1] 44
common_var_selected_50_en_iTwiner
##  [1] RAC1P3     XRCC6P2    OR51K1P    LINC01100  EEF1B2P6   HSPD1P7   
##  [7] FABP7P2    RPL31P35   COX6CP7    TRAJ20     RNU6-1085P MIR8052   
## [13] TRBJ2-1    DNAJC19P3  RNU6-337P  TRAJ61     MIR6816    TIMM8AP1  
## [19] TRAJ25     MTCO3P42   RNU4-39P   TMSB4XP1   HNRNPCL1   LINC02178 
## [25] EIF4EP3    IGHJ1      MIR193B    MIR4439    TRAJ28     ATP5MGP5  
## [31] ATP6V1E1P2 MIR1288    MIR564     MIRLET7A2  MIR4254    BCRP7     
## [37] GDI2P1     ISCA1P3    MIR409     PMPCAP1    RNA5SP74   RNU6-865P 
## [43] RNU6ATAC7P SYCN      
## 236 Levels: ACTN4P1 AGTR2 ALOX15P2 AMMECR1LP1 ANKRD20A10P APOOP4 ... XRCC6P2
  • Variables selected by EN and iTwiner that are DEGs
# Differential-expression test (edgeR) restricted to the genes in the EN and
# iTwiner top-50 lists, comparing the classes in DATASET1_bal$class.
xdata <- rnaseq1[order(row.names(rnaseq1)), ]

# A gene ranked in both top-50 lists must enter the test only once. Without
# unique() the data-frame column selection creates copies such as "RAC1P3.1"
# and "XRCC6P2.1", which are then fitted and tested twice, distorting the BH
# correction and inflating the DEG count.
# NOTE: the results printed further down in this report were rendered before
# this de-duplication and still show the duplicated rows; re-knit to refresh.
nomesgenes <- unique(c(as.vector(top50_en), as.vector(top50_itw)))
xdata <- xdata[, nomesgenes]
rownames(DATASET1_bal) <- DATASET1_bal$ID
ydata <- as.data.frame(DATASET1_bal[order(row.names(DATASET1_bal)), ])
rownames(ydata) <- ydata$ID

# keep only the samples present in both the expression and the clinical table
xdata <- xdata[rownames(xdata) %in% rownames(ydata), ]
ydata <- as.data.frame(ydata[rownames(ydata) %in% rownames(xdata), ])

# The group labels are matched to the count columns purely by position, so a
# misalignment would silently test the wrong contrast: fail loudly instead.
stopifnot(identical(rownames(xdata), rownames(ydata)))

# keep features with standard deviation > 0
xdata <- xdata[, vapply(seq_len(ncol(xdata)),
                        function(ix) sd(xdata[, ix]),
                        numeric(1)) != 0]
xdata <- t(xdata)  # edgeR expects genes in rows and samples in columns

group <- as.factor(ydata$class)
class <- as.data.frame(ydata$class)
edgeR.DGElist <- DGEList(counts = xdata, group = group)

# remove genes that do not have one count per million in at least 5 samples
keep <- rowSums(cpm(edgeR.DGElist) >= 1) >= 5
edgeR.DGElist <- edgeR.DGElist[keep, ]

# specify the design setup (intercept + group effect)
design <- model.matrix(~group)

# estimate the dispersion for all read counts across all samples
edgeR.DGElist <- estimateDisp(edgeR.DGElist, design)

# fit the negative binomial model
edger_fit <- glmFit(edgeR.DGElist, design)

# perform the testing for every gene using the neg. binomial model
# (glmLRT is called without coef/contrast, i.e. with its default)
edger_lrt <- glmLRT(edger_fit)
summary(decideTests(edger_lrt))
##        groupPm
## Down         4
## NotSig      93
## Up           3
# Extract the full results table from edger_lrt (n = Inf keeps every gene),
# sorted by p-value, with Benjamini-Hochberg adjusted p-values (FDR column).
DGE.results_edgeR <- topTags(edger_lrt, n = Inf , sort.by = "PValue" , adjust.method = "BH" )

topTags(DGE.results_edgeR) #table with the top10 DEGs
## Coefficient:  groupPm 
##                 logFC    logCPM        LR       PValue          FDR
## MIR8052     2.7372046  6.009474 19.669113 9.207769e-06 0.0006421211
## RNU6-337P   3.2286673  6.017764 19.033788 1.284242e-05 0.0006421211
## RAC1P3     -2.9946864  6.419308 13.309733 2.640319e-04 0.0066007975
## RAC1P3.1   -2.9946864  6.419308 13.309733 2.640319e-04 0.0066007975
## XRCC6P2    -3.3430257  6.364429 10.601001 1.130265e-03 0.0188377469
## XRCC6P2.1  -3.3430257  6.364429 10.601001 1.130265e-03 0.0188377469
## RNU6-1266P  3.1010187  7.350505  9.753307 1.790004e-03 0.0255714897
## MGST1       0.6999765 16.087821  6.929356 8.479222e-03 0.1059902710
## MIR7107     2.8806668  7.217542  6.048031 1.392181e-02 0.1546868129
## EEF1B2P6   -2.0438206  6.049605  5.732342 1.665525e-02 0.1612076403
# Keep the genes with FDR < 0.05 and export them; the gene names are copied
# from the row names into a `row` column before writing.
# NOTE(review): the 7 rows reported below include RAC1P3.1 and XRCC6P2.1,
# which are duplicates of RAC1P3 and XRCC6P2 in the topTags table above,
# so only 5 distinct genes pass the threshold.
genes_deg <- DGE.results_edgeR$table
genes_deg <- genes_deg[which(genes_deg$FDR < 0.05),]
dim(genes_deg)#genes found to be differentially expressed
## [1] 7 5
genes_deg$row <- row.names(genes_deg)
write_xlsx(genes_deg,"genes_deg_enitw_d1.xlsx")

# genes with positive logFC for the tested coefficient (groupPm)
high <- genes_deg[genes_deg$logFC > 0,]
dim(high)
## [1] 3 6
# genes with negative logFC for the tested coefficient (groupPm)
low <- genes_deg[genes_deg$logFC < 0,]
dim(low)
## [1] 4 6

3.1.5.3 3) Classification based on genes selected by regularized logistic regression

3.1.5.3.1 EN + Classifiers
# Build the classifier input for the EN gene set: the expression columns of
# the top-50 EN genes plus the class label, with samples matched by ID.
xdata <- xdataT

# Labels: one row per sample, keyed by DATASET1$ID.
# NOTE(review): assumes clinic1 and DATASET1 list the samples in the same
# order -- confirm upstream.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

#ydata.raw$class <- c(rep(0,28),rep(1,34))

# recode the classes: "P" -> 0, "Pm" -> 1
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# keep only the samples present in both tables, then sort both by sample ID
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ])

xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]

# The label is attached to the expression rows by position below, so the two
# tables must list exactly the same samples in the same order.
stopifnot(identical(rownames(xdata), rownames(ydata.raw)))

ydata <- as.data.frame(ydata.raw[, 1:2])
colnames(ydata) <- c("class", "id")

names <- as.vector(top50_en)
xdata_en <- as.data.frame(xdata[, names])

# keep the gene symbols, then switch to generic Var1..Varp column names;
# numbering follows the actual column count rather than a hard-coded 50
nomesgenes <- colnames(xdata_en)
colnames(xdata_en) <- paste0("Var", seq_len(ncol(xdata_en)))
xdata_en$type <- as.factor(ydata$class)
ydata$class <- as.numeric(ydata$class)
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.1.5.3.1.1 Measures of model performance train
  • Accuracy
# Accuracy on the training set, EN + classifiers: mean / median / SD of each
# row of acc_enplus_train.
# NOTE(review): rows 1-5 are read as tree, svm, svmR, logistic, random forest
# based on the variable names used here -- confirm against the fitting loop.
acc_trees <- acc_enplus_train[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.9902381
median(acc_trees)
## [1] 1
sd(acc_trees)
## [1] 0.01439552
acc_svm <- acc_enplus_train[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.9288095
median(acc_svm)
## [1] 0.9285714
sd(acc_svm)
## [1] 0.02356658
acc_svmR <- acc_enplus_train[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.8940476
median(acc_svmR)
## [1] 0.9285714
sd(acc_svmR)
## [1] 0.08748996
acc_logs <- acc_enplus_train[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 1
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0
acc_rf <- acc_enplus_train[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 1
median(acc_rf)
## [1] 1
sd(acc_rf)
## [1] 0
  • auc
# AUC on the training set, EN + classifiers: mean / median / SD of each row
# of auc_enplus_train (rows read as tree, svm, svmR, logistic, random forest
# from the variable names -- TODO confirm).
# NOTE(review): every value printed here is identical to the corresponding
# training accuracy summary of this section; check how auc_enplus_train is
# computed.
auc_trees <- auc_enplus_train[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.9902381
median(auc_trees)
## [1] 1
sd(auc_trees)
## [1] 0.01439552
auc_svm <- auc_enplus_train[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.9288095
median(auc_svm)
## [1] 0.9285714
sd(auc_svm)
## [1] 0.02356658
auc_svmR <- auc_enplus_train[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.8940476
median(auc_svmR)
## [1] 0.9285714
sd(auc_svmR)
## [1] 0.08748996
auc_logs <- auc_enplus_train[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 1
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0
auc_rf <- auc_enplus_train[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 1
median(auc_rf)
## [1] 1
sd(auc_rf)
## [1] 0
  • Misclassification
# Misclassifications on the training set, EN + classifiers: mean / median /
# SD of each row of miscl_enplus_train (rows read as tree, svm, svmR,
# logistic, random forest from the variable names -- TODO confirm).
# NOTE(review): means above 1 suggest these are per-run counts, not rates.
miscl_trees <- miscl_enplus_train[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 0.41
median(miscl_trees)
## [1] 0
sd(miscl_trees)
## [1] 0.6046119
miscl_svm <- miscl_enplus_train[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 2.99
median(miscl_svm)
## [1] 3
sd(miscl_svm)
## [1] 0.9897964
miscl_svmR <- miscl_enplus_train[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 4.45
median(miscl_svmR)
## [1] 3
sd(miscl_svmR)
## [1] 3.674578
miscl_logs <- miscl_enplus_train[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 0
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0
miscl_rf <- miscl_enplus_train[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 0
median(miscl_rf)
## [1] 0
sd(miscl_rf)
## [1] 0
  • Sensitivity
# Sensitivity on the training set, EN + classifiers: mean / median / SD of
# each row of sensitivity_enplus_train (rows read as tree, svm, svmR,
# logistic, random forest from the variable names -- TODO confirm).
sensitivity_trees <- sensitivity_enplus_train[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.99
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.01949334
sensitivity_svm <- sensitivity_enplus_train[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.857619
median(sensitivity_svm)
## [1] 0.8571429
sd(sensitivity_svm)
## [1] 0.04713316
sensitivity_svmR <- sensitivity_enplus_train[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.8771429
median(sensitivity_svmR)
## [1] 0.8571429
sd(sensitivity_svmR)
## [1] 0.06841653
sensitivity_logs <- sensitivity_enplus_train[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 1
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0
sensitivity_rf <- sensitivity_enplus_train[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 1
median(sensitivity_rf)
## [1] 1
sd(sensitivity_rf)
## [1] 0
  • Specificity
# Specificity on the training set, EN + classifiers: mean / median / SD of
# each row of specificity_enplus_train (rows read as tree, svm, svmR,
# logistic, random forest from the variable names -- TODO confirm).
specificity_trees <- specificity_enplus_train[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.9904762
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.01914358
specificity_svm <- specificity_enplus_train[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 1
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0
specificity_svmR <- specificity_enplus_train[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.9109524
median(specificity_svmR)
## [1] 1
sd(specificity_svmR)
## [1] 0.1542892
specificity_logs <- specificity_enplus_train[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 1
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0
specificity_rf <- specificity_enplus_train[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# False negatives on the training set, EN + classifiers: mean / median / SD
# of each row of fneg_enplus_train (rows read as tree, svm, svmR, logistic,
# random forest from the variable names -- TODO confirm).
# NOTE(review): means above 1 suggest these are per-run counts, not rates.
fneg_trees <- fneg_enplus_train[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 0.21
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.4093602
fneg_svm <- fneg_enplus_train[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 2.99
median(fneg_svm)
## [1] 3
sd(fneg_svm)
## [1] 0.9897964
fneg_svmR <- fneg_enplus_train[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 2.58
median(fneg_svmR)
## [1] 3
sd(fneg_svmR)
## [1] 1.436747
fneg_logs <- fneg_enplus_train[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 0
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0
fneg_rf <- fneg_enplus_train[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 0
median(fneg_rf)
## [1] 0
sd(fneg_rf)
## [1] 0
3.1.5.3.1.2 Measures of model performance test
  • Accuracy
# Accuracy on the test set, EN + classifiers: mean / median / SD of each row
# of acc_enplus (rows read as tree, svm, svmR, logistic, random forest from
# the variable names -- TODO confirm).
# The acc_* names are the same as in the training summary and are overwritten.
acc_trees <- acc_enplus[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.7133333
median(acc_trees)
## [1] 0.7222222
sd(acc_trees)
## [1] 0.0943338
acc_svm <- acc_enplus[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.7166667
median(acc_svm)
## [1] 0.7222222
sd(acc_svm)
## [1] 0.08148913
acc_svmR <- acc_enplus[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.7544444
median(acc_svmR)
## [1] 0.7777778
sd(acc_svmR)
## [1] 0.1008691
acc_logs <- acc_enplus[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.7094444
median(acc_logs)
## [1] 0.7222222
sd(acc_logs)
## [1] 0.09762583
acc_rf <- acc_enplus[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.7905556
median(acc_rf)
## [1] 0.7777778
sd(acc_rf)
## [1] 0.0956906
  • auc
# AUC on the test set, EN + classifiers: mean / median / SD of each row of
# auc_enplus (rows read as tree, svm, svmR, logistic, random forest from the
# variable names -- TODO confirm).
# NOTE(review): every value printed here is identical to the corresponding
# test accuracy summary of this section; check how auc_enplus is computed.
auc_trees <- auc_enplus[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.7133333
median(auc_trees)
## [1] 0.7222222
sd(auc_trees)
## [1] 0.0943338
auc_svm <- auc_enplus[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.7166667
median(auc_svm)
## [1] 0.7222222
sd(auc_svm)
## [1] 0.08148913
auc_svmR <- auc_enplus[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.7544444
median(auc_svmR)
## [1] 0.7777778
sd(auc_svmR)
## [1] 0.1008691
auc_logs <- auc_enplus[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.7094444
median(auc_logs)
## [1] 0.7222222
sd(auc_logs)
## [1] 0.09762583
auc_rf <- auc_enplus[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.7905556
median(auc_rf)
## [1] 0.7777778
sd(auc_rf)
## [1] 0.0956906
  • Misclassification
# Misclassifications on the test set, EN + classifiers: mean / median / SD of
# each row of miscl_enplus (rows read as tree, svm, svmR, logistic, random
# forest from the variable names -- TODO confirm).
# NOTE(review): means above 1 suggest these are per-run counts, not rates.
miscl_trees <- miscl_enplus[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 5.16
median(miscl_trees)
## [1] 5
sd(miscl_trees)
## [1] 1.698008
miscl_svm <- miscl_enplus[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 5.1
median(miscl_svm)
## [1] 5
sd(miscl_svm)
## [1] 1.466804
miscl_svmR <- miscl_enplus[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 4.42
median(miscl_svmR)
## [1] 4
sd(miscl_svmR)
## [1] 1.815645
miscl_logs <- miscl_enplus[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 5.23
median(miscl_logs)
## [1] 5
sd(miscl_logs)
## [1] 1.757265
miscl_rf <- miscl_enplus[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 3.77
median(miscl_rf)
## [1] 4
sd(miscl_rf)
## [1] 1.722431
  • Sensitivity
# Sensitivity on the test set, EN + classifiers: mean / median / SD of each
# row of sensitivity_enplus (rows read as tree, svm, svmR, logistic, random
# forest from the variable names -- TODO confirm).
sensitivity_trees <- sensitivity_enplus[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.7233333
median(sensitivity_trees)
## [1] 0.7777778
sd(sensitivity_trees)
## [1] 0.1390246
sensitivity_svm <- sensitivity_enplus[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.5888889
median(sensitivity_svm)
## [1] 0.5555556
sd(sensitivity_svm)
## [1] 0.1598884
sensitivity_svmR <- sensitivity_enplus[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.8133333
median(sensitivity_svmR)
## [1] 0.7777778
sd(sensitivity_svmR)
## [1] 0.1159228
sensitivity_logs <- sensitivity_enplus[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.6688889
median(sensitivity_logs)
## [1] 0.6666667
sd(sensitivity_logs)
## [1] 0.139459
sensitivity_rf <- sensitivity_enplus[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.7077778
median(sensitivity_rf)
## [1] 0.6666667
sd(sensitivity_rf)
## [1] 0.126986
  • Specificity
# Specificity on the test set, EN + classifiers: mean / median / SD of each
# row of specificity_enplus (rows read as tree, svm, svmR, logistic, random
# forest from the variable names -- TODO confirm).
specificity_trees <- specificity_enplus[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.7033333
median(specificity_trees)
## [1] 0.6666667
sd(specificity_trees)
## [1] 0.1595409
specificity_svm <- specificity_enplus[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.8444444
median(specificity_svm)
## [1] 0.8888889
sd(specificity_svm)
## [1] 0.1181813
specificity_svmR <- specificity_enplus[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.6955556
median(specificity_svmR)
## [1] 0.6666667
sd(specificity_svmR)
## [1] 0.1957396
specificity_logs <- specificity_enplus[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.75
median(specificity_logs)
## [1] 0.7777778
sd(specificity_logs)
## [1] 0.1558399
specificity_rf <- specificity_enplus[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 0.8733333
median(specificity_rf)
## [1] 0.8888889
sd(specificity_rf)
## [1] 0.127344
  • False Neg
# False negatives on the test set, EN + classifiers: mean / median / SD of
# each row of fneg_enplus (rows read as tree, svm, svmR, logistic, random
# forest from the variable names -- TODO confirm).
# NOTE(review): means above 1 suggest these are per-run counts, not rates.
fneg_trees <- fneg_enplus[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 2.49
median(fneg_trees)
## [1] 2
sd(fneg_trees)
## [1] 1.251222
fneg_svm <- fneg_enplus[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 3.7
median(fneg_svm)
## [1] 4
sd(fneg_svm)
## [1] 1.438995
fneg_svmR <- fneg_enplus[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 1.68
median(fneg_svmR)
## [1] 2
sd(fneg_svmR)
## [1] 1.043305
fneg_logs <- fneg_enplus[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 2.98
median(fneg_logs)
## [1] 3
sd(fneg_logs)
## [1] 1.255131
fneg_rf <- fneg_enplus[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 2.63
median(fneg_rf)
## [1] 3
sd(fneg_rf)
## [1] 1.142874
3.1.5.3.2 iTwiner + classifiers
# Build the classifier input for the iTwiner gene set: the expression columns
# of the top-50 iTwiner genes plus the class label, with samples matched by ID.
xdata <- xdataT

# Labels: one row per sample, keyed by DATASET1$ID.
# NOTE(review): assumes clinic1 and DATASET1 list the samples in the same
# order -- confirm upstream.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

#ydata.raw$class <- c(rep(0,28),rep(1,34))

# recode the classes: "P" -> 0, "Pm" -> 1
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# keep only the samples present in both tables, then sort both by sample ID
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ])

xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]

# The label is attached to the expression rows by position below, so the two
# tables must list exactly the same samples in the same order.
stopifnot(identical(rownames(xdata), rownames(ydata.raw)))

ydata <- as.data.frame(ydata.raw[, 1:2])
colnames(ydata) <- c("class", "id")

names <- as.vector(top50_itw)
xdata_iTwiner <- as.data.frame(xdata[, names])

# keep the gene symbols, then switch to generic Var1..Varp column names;
# numbering follows the actual column count rather than a hard-coded 50
nomesgenes <- colnames(xdata_iTwiner)
colnames(xdata_iTwiner) <- paste0("Var", seq_len(ncol(xdata_iTwiner)))
xdata_iTwiner$type <- as.factor(ydata$class)
ydata$class <- as.numeric(ydata$class)
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.1.5.3.2.1 Measures of model performance train
  • Accuracy
# Accuracy on the training set, iTwiner + classifiers: mean / median / SD of
# each row of acc_tcoxplus_train (rows read as tree, svm, svmR, logistic,
# random forest from the variable names -- TODO confirm).
# The acc_* names are shared with the EN + classifiers section and are
# overwritten here.
acc_trees <- acc_tcoxplus_train[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.9952381
median(acc_trees)
## [1] 1
sd(acc_trees)
## [1] 0.01070159
acc_svm <- acc_tcoxplus_train[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.9090476
median(acc_svm)
## [1] 0.9047619
sd(acc_svm)
## [1] 0.02449237
acc_svmR <- acc_tcoxplus_train[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.9178571
median(acc_svmR)
## [1] 0.9166667
sd(acc_svmR)
## [1] 0.05502478
acc_logs <- acc_tcoxplus_train[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.9985714
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0.005682935
acc_rf <- acc_tcoxplus_train[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.9695238
median(acc_rf)
## [1] 0.9761905
sd(acc_rf)
## [1] 0.01590185
  • auc
# AUC on the training set, iTwiner + classifiers: mean / median / SD of each
# row of auc_tcoxplus_train (rows read as tree, svm, svmR, logistic, random
# forest from the variable names -- TODO confirm).
# NOTE(review): every value printed here is identical to the corresponding
# training accuracy summary of this section; check how auc_tcoxplus_train is
# computed.
auc_trees <- auc_tcoxplus_train[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.9952381
median(auc_trees)
## [1] 1
sd(auc_trees)
## [1] 0.01070159
auc_svm <- auc_tcoxplus_train[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.9090476
median(auc_svm)
## [1] 0.9047619
sd(auc_svm)
## [1] 0.02449237
auc_svmR <- auc_tcoxplus_train[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.9178571
median(auc_svmR)
## [1] 0.9166667
sd(auc_svmR)
## [1] 0.05502478
auc_logs <- auc_tcoxplus_train[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.9985714
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0.005682935
auc_rf <- auc_tcoxplus_train[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.9695238
median(auc_rf)
## [1] 0.9761905
sd(auc_rf)
## [1] 0.01590185
  • Misclassification
# Misclassifications on the training set, iTwiner + classifiers: mean /
# median / SD of each row of miscl_tcoxplus_train (rows read as tree, svm,
# svmR, logistic, random forest from the variable names -- TODO confirm).
# NOTE(review): means above 1 suggest these are per-run counts, not rates.
miscl_trees <- miscl_tcoxplus_train[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 0.2
median(miscl_trees)
## [1] 0
sd(miscl_trees)
## [1] 0.4494666
miscl_svm <- miscl_tcoxplus_train[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 3.82
median(miscl_svm)
## [1] 4
sd(miscl_svm)
## [1] 1.02868
miscl_svmR <- miscl_tcoxplus_train[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 3.45
median(miscl_svmR)
## [1] 3.5
sd(miscl_svmR)
## [1] 2.311041
miscl_logs <- miscl_tcoxplus_train[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 0.06
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0.2386833
miscl_rf <- miscl_tcoxplus_train[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 1.28
median(miscl_rf)
## [1] 1
sd(miscl_rf)
## [1] 0.6678777
  • Sensitivity
# Sensitivity on the training set, iTwiner + classifiers: mean / median / SD
# of each row of sensitivity_tcoxplus_train (rows read as tree, svm, svmR,
# logistic, random forest from the variable names -- TODO confirm).
sensitivity_trees <- sensitivity_tcoxplus_train[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.9985714
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.008164124
sensitivity_svm <- sensitivity_tcoxplus_train[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.8185714
median(sensitivity_svm)
## [1] 0.8095238
sd(sensitivity_svm)
## [1] 0.04913182
sensitivity_svmR <- sensitivity_tcoxplus_train[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.8642857
median(sensitivity_svmR)
## [1] 0.8571429
sd(sensitivity_svmR)
## [1] 0.08188636
sensitivity_logs <- sensitivity_tcoxplus_train[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.997619
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0.01043061
sensitivity_rf <- sensitivity_tcoxplus_train[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.9390476
median(sensitivity_rf)
## [1] 0.952381
sd(sensitivity_rf)
## [1] 0.0318037
  • Specificity
# Specificity on the training set, iTwiner + classifiers: mean / median / SD
# of each row of specificity_tcoxplus_train (rows read as tree, svm, svmR,
# logistic, random forest from the variable names -- TODO confirm).
specificity_trees <- specificity_tcoxplus_train[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.9919048
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.02036678
specificity_svm <- specificity_tcoxplus_train[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.9995238
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.004761905
specificity_svmR <- specificity_tcoxplus_train[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.9714286
median(specificity_svmR)
## [1] 1
sd(specificity_svmR)
## [1] 0.06015776
specificity_logs <- specificity_tcoxplus_train[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.9995238
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0.004761905
specificity_rf <- specificity_tcoxplus_train[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# False negatives on the training set, iTwiner + classifiers: mean / median /
# SD of each row of fneg_tcoxplus_train (rows read as tree, svm, svmR,
# logistic, random forest from the variable names -- TODO confirm).
# NOTE(review): means above 1 suggest these are per-run counts, not rates.
fneg_trees <- fneg_tcoxplus_train[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 0.03
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.1714466
fneg_svm <- fneg_tcoxplus_train[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 3.81
median(fneg_svm)
## [1] 4
sd(fneg_svm)
## [1] 1.031768
fneg_svmR <- fneg_tcoxplus_train[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 2.85
median(fneg_svmR)
## [1] 3
sd(fneg_svmR)
## [1] 1.719614
fneg_logs <- fneg_tcoxplus_train[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 0.05
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0.2190429
fneg_rf <- fneg_tcoxplus_train[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 1.28
median(fneg_rf)
## [1] 1
sd(fneg_rf)
## [1] 0.6678777
3.1.5.3.2.2 Measures of model performance test
  • Accuracy
# Accuracy on the test set, iTwiner + classifiers: mean / median / SD of each
# row of acc_tcoxplus (rows read as tree, svm, svmR, logistic, random forest
# from the variable names -- TODO confirm).
# The acc_* names are the same as in the training summary and are overwritten.
acc_trees <- acc_tcoxplus[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.7672222
median(acc_trees)
## [1] 0.7777778
sd(acc_trees)
## [1] 0.09563845
acc_svm <- acc_tcoxplus[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.8261111
median(acc_svm)
## [1] 0.8333333
sd(acc_svm)
## [1] 0.07135367
acc_svmR <- acc_tcoxplus[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.7961111
median(acc_svmR)
## [1] 0.7777778
sd(acc_svmR)
## [1] 0.08900631
acc_logs <- acc_tcoxplus[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.7255556
median(acc_logs)
## [1] 0.7222222
sd(acc_logs)
## [1] 0.09370377
acc_rf <- acc_tcoxplus[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.8627778
median(acc_rf)
## [1] 0.8611111
sd(acc_rf)
## [1] 0.06290104
  • auc
# AUC on the test set, iTwiner + classifiers: mean / median / SD of each row
# of auc_tcoxplus (rows read as tree, svm, svmR, logistic, random forest from
# the variable names -- TODO confirm).
# NOTE(review): every value printed here is identical to the corresponding
# test accuracy summary of this section; check how auc_tcoxplus is computed.
auc_trees <- auc_tcoxplus[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.7672222
median(auc_trees)
## [1] 0.7777778
sd(auc_trees)
## [1] 0.09563845
auc_svm <- auc_tcoxplus[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.8261111
median(auc_svm)
## [1] 0.8333333
sd(auc_svm)
## [1] 0.07135367
auc_svmR <- auc_tcoxplus[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.7961111
median(auc_svmR)
## [1] 0.7777778
sd(auc_svmR)
## [1] 0.08900631
auc_logs <- auc_tcoxplus[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.7255556
median(auc_logs)
## [1] 0.7222222
sd(auc_logs)
## [1] 0.09370377
auc_rf <- auc_tcoxplus[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.8627778
median(auc_rf)
## [1] 0.8611111
sd(auc_rf)
## [1] 0.06290104
  • Misclassification
# Misclassifications on the test set, iTwiner + classifiers: mean / median /
# SD of each row of miscl_tcoxplus (rows read as tree, svm, svmR, logistic,
# random forest from the variable names -- TODO confirm).
# NOTE(review): means above 1 suggest these are per-run counts, not rates.
miscl_trees <- miscl_tcoxplus[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 4.19
median(miscl_trees)
## [1] 4
sd(miscl_trees)
## [1] 1.721492
miscl_svm <- miscl_tcoxplus[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 3.13
median(miscl_svm)
## [1] 3
sd(miscl_svm)
## [1] 1.284366
miscl_svmR <- miscl_tcoxplus[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 3.67
median(miscl_svmR)
## [1] 4
sd(miscl_svmR)
## [1] 1.602114
miscl_logs <- miscl_tcoxplus[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 4.94
median(miscl_logs)
## [1] 5
sd(miscl_logs)
## [1] 1.686668
miscl_rf <- miscl_tcoxplus[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 2.47
median(miscl_rf)
## [1] 2.5
sd(miscl_rf)
## [1] 1.132219
  • Sensitivity
# Sensitivity on the test set, iTwiner + classifiers: mean / median / SD of
# each row of sensitivity_tcoxplus (rows read as tree, svm, svmR, logistic,
# random forest from the variable names -- TODO confirm).
sensitivity_trees <- sensitivity_tcoxplus[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.7255556
median(sensitivity_trees)
## [1] 0.7777778
sd(sensitivity_trees)
## [1] 0.1362705
sensitivity_svm <- sensitivity_tcoxplus[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.6566667
median(sensitivity_svm)
## [1] 0.6666667
sd(sensitivity_svm)
## [1] 0.1430913
sensitivity_svmR <- sensitivity_tcoxplus[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.7933333
median(sensitivity_svmR)
## [1] 0.7777778
sd(sensitivity_svmR)
## [1] 0.1128042
sensitivity_logs <- sensitivity_tcoxplus[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.6433333
median(sensitivity_logs)
## [1] 0.6666667
sd(sensitivity_logs)
## [1] 0.1589457
sensitivity_rf <- sensitivity_tcoxplus[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.7288889
median(sensitivity_rf)
## [1] 0.7777778
sd(sensitivity_rf)
## [1] 0.1226551
  • Specificity
# Specificity summaries: each row of specificity_tcoxplus is extracted into its
# own vector (trees, svm, svmR, logs, rf) and summarised by mean, median and
# standard deviation.
specificity_trees <- specificity_tcoxplus[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.8088889
median(specificity_trees)
## [1] 0.7777778
sd(specificity_trees)
## [1] 0.1456695
# Row 2 (named svm here).
specificity_svm <- specificity_tcoxplus[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.9955556
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.02188293
# Row 3 (named svmR here).
specificity_svmR <- specificity_tcoxplus[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.7988889
median(specificity_svmR)
## [1] 0.7777778
sd(specificity_svmR)
## [1] 0.1528668
# Row 4 (named logs here).
specificity_logs <- specificity_tcoxplus[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.8077778
median(specificity_logs)
## [1] 0.7777778
sd(specificity_logs)
## [1] 0.1300906
# Row 5 (named rf here).
specificity_rf <- specificity_tcoxplus[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 0.9966667
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0.02474461
  • False negatives
# False-negative summaries: each row of fneg_tcoxplus is extracted into its own
# vector (trees, svm, svmR, logs, rf) and summarised by mean, median and
# standard deviation.
# NOTE(review): the values look like counts rather than rates (e.g. mean 2.47);
# presumably false negatives per run - confirm.
fneg_trees <- fneg_tcoxplus[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 2.47
median(fneg_trees)
## [1] 2
sd(fneg_trees)
## [1] 1.226434
# Row 2 (named svm here).
fneg_svm <- fneg_tcoxplus[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 3.09
median(fneg_svm)
## [1] 3
sd(fneg_svm)
## [1] 1.287822
# Row 3 (named svmR here).
fneg_svmR <- fneg_tcoxplus[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 1.86
median(fneg_svmR)
## [1] 2
sd(fneg_svmR)
## [1] 1.015237
# Row 4 (named logs here).
fneg_logs <- fneg_tcoxplus[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 3.21
median(fneg_logs)
## [1] 3
sd(fneg_logs)
## [1] 1.430512
# Row 5 (named rf here).
fneg_rf <- fneg_tcoxplus[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 2.44
median(fneg_rf)
## [1] 2
sd(fneg_rf)
## [1] 1.103896
# save results

#save.image("~/results1_2010_final.RData")

3.1.6 Data final

# Long-format accuracy table for the classifier in row 1 of the accuracy
# matrices (labelled "DT"): one row per accuracy value, tagged with the
# feature-selection variant it came from and with the dataset name.
dt1 <- acc[1, ]
dt_en1 <- acc_enplus[1, ]
dt_iTwiner1 <- acc_tcoxplus[1, ]
#dt_hub <- acc_hubplus[1,]

acc_dt1 <- as.data.frame(c(dt1, dt_en1, dt_iTwiner1
                           #,dt_hub
))
colnames(acc_dt1) <- "acc"

# Label each value by its source. The block sizes come from the vectors
# themselves rather than from fixed 1:100 / 101:200 / 201:300 indices, so the
# labels stay aligned if the number of values per source is not exactly 100.
# The HUB variant ("HUB + DT") is currently disabled.
acc_dt1$group <- local({
  labels <- c("DT", "EN + DT", "iTwiner + DT")
  sizes <- lengths(list(dt1, dt_en1, dt_iTwiner1))
  ordered(rep(labels, times = sizes), levels = labels)
})

acc_dt1$dataset <- "DATASET1"



# Long-format accuracy table for the classifier in row 2 of the accuracy
# matrices (labelled "svmL"): one row per accuracy value, tagged with the
# feature-selection variant it came from and with the dataset name.
svmL1 <- acc[2, ]
svmL_en1 <- acc_enplus[2, ]
svmL_iTwiner1 <- acc_tcoxplus[2, ]

acc_svmL1 <- as.data.frame(c(svmL1, svmL_en1, svmL_iTwiner1
                             #,svmL_hub
))
colnames(acc_svmL1) <- "acc"

# Block sizes come from the vectors themselves rather than from fixed
# 101:200 / 201:300 indices, so the labels stay aligned if the number of
# values per source is not exactly 100.
acc_svmL1$group <- local({
  labels <- c("svmL", "EN + svmL", "iTwiner + svmL")
  sizes <- lengths(list(svmL1, svmL_en1, svmL_iTwiner1))
  ordered(rep(labels, times = sizes), levels = labels)
})

acc_svmL1$dataset <- "DATASET1"




# Long-format accuracy table for the classifier in row 3 of the accuracy
# matrices (labelled "svmR"): one row per accuracy value, tagged with the
# feature-selection variant it came from and with the dataset name.
svmR1 <- acc[3, ]
svmR_en1 <- acc_enplus[3, ]
svmR_iTwiner1 <- acc_tcoxplus[3, ]

acc_svmR1 <- as.data.frame(c(svmR1, svmR_en1, svmR_iTwiner1
                             #,svmR_hub
))
colnames(acc_svmR1) <- "acc"

# Block sizes come from the vectors themselves rather than from fixed
# 101:200 / 201:300 indices, so the labels stay aligned if the number of
# values per source is not exactly 100.
acc_svmR1$group <- local({
  labels <- c("svmR", "EN + svmR", "iTwiner + svmR")
  sizes <- lengths(list(svmR1, svmR_en1, svmR_iTwiner1))
  ordered(rep(labels, times = sizes), levels = labels)
})

acc_svmR1$dataset <- "DATASET1"






# Long-format accuracy table for the classifier in row 4 of the accuracy
# matrices (labelled "logist"): one row per accuracy value, tagged with the
# feature-selection variant it came from and with the dataset name.
logist1 <- acc[4, ]
logist_en1 <- acc_enplus[4, ]
logist_iTwiner1 <- acc_tcoxplus[4, ]

acc_logist1 <- as.data.frame(c(logist1, logist_en1, logist_iTwiner1
                               #,logist_hub
))
colnames(acc_logist1) <- "acc"

# Block sizes come from the vectors themselves rather than from fixed
# 101:200 / 201:300 indices, so the labels stay aligned if the number of
# values per source is not exactly 100.
acc_logist1$group <- local({
  labels <- c("logist", "EN + logist", "iTwiner + logist")
  sizes <- lengths(list(logist1, logist_en1, logist_iTwiner1))
  ordered(rep(labels, times = sizes), levels = labels)
})

acc_logist1$dataset <- "DATASET1"






# Long-format accuracy table for the classifier in row 5 of the accuracy
# matrices (labelled "rf"): one row per accuracy value, tagged with the
# feature-selection variant it came from and with the dataset name.
rf1 <- acc[5, ]
rf_en1 <- acc_enplus[5, ]
rf_iTwiner1 <- acc_tcoxplus[5, ]

acc_rf1 <- as.data.frame(c(rf1, rf_en1, rf_iTwiner1
                           #,rf_hub
))
colnames(acc_rf1) <- "acc"

# Block sizes come from the vectors themselves rather than from fixed
# 101:200 / 201:300 indices, so the labels stay aligned if the number of
# values per source is not exactly 100.
acc_rf1$group <- local({
  labels <- c("rf", "EN + rf", "iTwiner + rf")
  sizes <- lengths(list(rf1, rf_en1, rf_iTwiner1))
  ordered(rep(labels, times = sizes), levels = labels)
})

acc_rf1$dataset <- "DATASET1"

3.2 DATASET 2

3.2.1 DATA

# Restore the saved workspace for the second dataset. load() puts every object
# stored in the .RData file into the current environment and overwrites any
# existing object with the same name.
load("~/CRC_LCosta/results/results2_2010_final.RData")
# Alias the dataset-2 objects to the "*1" names that the code below reads
# (DATASET1_bal, clinic1, clinic1_bal, datasurv1).
# NOTE(review): DATASET1_bal is taken from df2 while DATASET1 is taken from
# DATASET2_bal - confirm this pairing is intended and not swapped.
DATASET1_bal <- df2
DATASET1 <- DATASET2_bal
# NOTE(review): clinic1 and clinic1_bal are both assigned clinic2_bal - confirm
# that clinic1 should not come from an unbalanced clinic2 object instead.
clinic1 <- clinic2_bal
datasurv1 <- datasurv2
clinic1_bal <- clinic2_bal
  • Split the dataset into two groups for classification: P → non-metastatic (CLASS = 1); Pm → metastatic (CLASS = 0)
# primM <- DATASET1 %>%
#   filter(str_detect(class, "m"))
# rownames(primM) <- primM$ID
# primN <- DATASET1 %>%
#   filter(!str_detect(class, "Pm"))
# rownames(primN) <- primN$ID
# 
# clinic_prim <- DATASET1
# 
# rnaprimM <- rnaseq1[rownames(rnaseq1) %in% 
#                          rownames(primM),]
# 
# rnaprimN <- rnaseq1[rownames(rnaseq1) %in% 
#                          rownames(primN),]
# 
# rnaprim <- rbind(rnaprimM,rnaprimN)
  • removing variables with sd=0
# xmet <- rnaprimM [,sapply(seq(ncol(rnaprimM)), function(ix) {sd(rnaprimM[,ix])}) != 0] 
# xnon <- rnaprimN[,sapply(seq(ncol(rnaprimN)), function(ix) {sd(rnaprimN[,ix])}) != 0] 
# 
# xmet_less <- xmet[,which(colnames(xmet) %in% colnames(xnon))]
# xnon_less <- xnon[,which(colnames(xnon) %in% colnames(xmet))]
# 
# # normalizing data
# xmet_norm <- scale(log2(xmet_less+1)) 
# xnon_norm <- scale(log2(xnon_less+1)) 
# 
# xdataT <- rbind(xmet_less,xnon_less)
# xdataT <- xdataT[ order(row.names(xdataT)), ]
# 
# 
# rm(xmet,xmet_less,xnon,xnon_less,rnaprimM,rnaprimN)
  • weight vector that penalizes genes with greater distances between Pm and P correlation matrices - TWINER
# #xmet_cor <- Matrix(cor(xmet_norm), sparse = TRUE)
# xmet_cor <- cor(xmet_norm)
# #xmet_cor <- as.data.frame(xmet_cor)
# xnon_cor <- cor(xnon_norm)
# #xnon_cor <- as.data.frame(xnon_cor)
# 
# # angular distance
# ang_weight <- vector()
# for (i in 1:dim(xmet_cor)[2]){ 
# ang_weight[i] <- acos(cosine(xmet_cor[,i],xnon_cor[,i]))/pi
# }
# 
# ## normalized weights
# 
# weights <- ang_weight / max(ang_weight)
# hist(weights,main="w")
# 
# 
# pen_weight2 <- 1 / weights
# hist(pen_weight2, main="1 / w")
# 
# rm(xmet_cor,xnon_cor)

3.2.2 Exploratory analysis

  • All patients (whole dataset)
clinical <- as.data.frame(clinic1_bal)


# Exploratory summary of a clinical table.
#
# clinical: data frame of clinical variables, one row per patient.
#
# Prints the column overview (glimpse), frequency tables and plots for the
# categorical variables (freq), the numeric profiling table (profiling_num) and
# the numeric-variable plots (plot_num). Returns the Hmisc::describe() summary,
# which is auto-printed when the function is called at top level.
basic_eda <- function(clinical) {
  glimpse(clinical)
  #df_Status(clinical)
  freq(clinical)
  # profiling_num() only returns its table; without print() the result is
  # silently discarded inside a function body.
  print(profiling_num(clinical))
  plot_num(clinical)
  # Describe the argument, not the global `clinic1`, so the function reports on
  # whatever table it is given.
  describe(clinical)
}
basic_eda(clinical)
## Rows: 55
## Columns: 6
## $ class     <chr> "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", …
## $ organ     <chr> "colon", "colon", "rectum", "colon", "colon", "colon", "colo…
## $ Sex       <chr> "f", "m", "m", "m", "m", "f", "f", "m", "f", "m", "m", "f", …
## $ Age       <dbl> 73, 59, 59, 81, 78, 75, 61, 73, 36, 70, 72, 71, 75, 78, 60, …
## $ Stage     <chr> "II", "II", "II", "II", "III", "II", "II", "III", "II", "III…
## $ sidedness <chr> "right", "right", "rectum", "right", "left", "right", "left"…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1    Pm        30      54.55           54.55
## 2     P        25      45.45          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        49      89.09           89.09
## 2 rectum         6      10.91          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        30      54.55           54.55
## 2   m        25      45.45          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1   III        31      56.36           56.36
## 2    II        24      43.64          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1     right        23      41.82           41.82
## 2      left        21      38.18           80.00
## 3    rectum         6      10.91           90.91
## 4      <NA>         5       9.09          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinic1 
## 
##  6  Variables      55  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct 
##       55        0        2 
##                       
## Value          P    Pm
## Frequency     25    30
## Proportion 0.455 0.545
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       55        0        2 
##                         
## Value       colon rectum
## Frequency      49      6
## Proportion  0.891  0.109
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       55        0        2 
##                       
## Value          f     m
## Frequency     30    25
## Proportion 0.545 0.455
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       47        8       26    0.998    67.77    13.33     42.5     56.6 
##      .25      .50      .75      .90      .95 
##     59.5     70.0     75.0     81.0     85.0 
## 
## lowest : 36 37 38 53 56, highest: 78 81 85 86 87
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       55        0        2 
##                       
## Value         II   III
## Frequency     24    31
## Proportion 0.436 0.564
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       50        5        3 
##                                
## Value        left rectum  right
## Frequency      21      6     23
## Proportion   0.42   0.12   0.46
## --------------------------------------------------------------------------------
# Mean age over all patients, ignoring missing ages.
a <- na.omit(clinical[["Age"]])
mean(a)
## [1] 67.76596
  • Only patients that do not metastasize
# Patients whose class does not contain "Pm", i.e. the P group.
clinical_p <- clinic1_bal %>%
  filter(!str_detect(class, "Pm"))


# Exploratory summary of a clinical table (redefinition used for the P group).
#
# clinical_p: data frame of clinical variables, one row per patient.
#
# Prints the column overview (glimpse), frequency tables and plots for the
# categorical variables (freq), the numeric profiling table (profiling_num) and
# the numeric-variable plots (plot_num). Returns the Hmisc::describe() summary,
# which is auto-printed when the function is called at top level.
basic_eda <- function(clinical_p) {
  glimpse(clinical_p)
  #df_Status(clinical_p)
  freq(clinical_p)
  # profiling_num() only returns its table; without print() the result is
  # silently discarded inside a function body.
  print(profiling_num(clinical_p))
  plot_num(clinical_p)
  describe(clinical_p)
}
basic_eda(clinical_p)
## Rows: 25
## Columns: 6
## $ class     <chr> "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", …
## $ organ     <chr> "colon", "colon", "rectum", "colon", "colon", "colon", "colo…
## $ Sex       <chr> "f", "m", "m", "m", "m", "f", "f", "m", "f", "m", "m", "f", …
## $ Age       <dbl> 73, 59, 59, 81, 78, 75, 61, 73, 36, 70, 72, 71, 75, 78, 60, …
## $ Stage     <chr> "II", "II", "II", "II", "III", "II", "II", "III", "II", "III…
## $ sidedness <chr> "right", "right", "rectum", "right", "left", "right", "left"…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1     P        25        100             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        24         96              96
## 2 rectum         1          4             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        13         52              52
## 2   m        12         48             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1    II        14         56              56
## 2   III        11         44             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1     right        14         56              56
## 2      left         8         32              88
## 3      <NA>         2          8              96
## 4    rectum         1          4             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinical_p 
## 
##  6  Variables      25  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct    value 
##       25        0        1        P 
##              
## Value       P
## Frequency  25
## Proportion  1
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       25        0        2 
##                         
## Value       colon rectum
## Frequency      24      1
## Proportion   0.96   0.04
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       25        0        2 
##                     
## Value         f    m
## Frequency    13   12
## Proportion 0.52 0.48
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       21        4       14    0.993    66.52    12.83       38       57 
##      .25      .50      .75      .90      .95 
##       60       71       75       78       78 
## 
## lowest : 36 38 57 59 60, highest: 72 73 75 78 81
##                                                                             
## Value         36    38    57    59    60    61    63    70    71    72    73
## Frequency      1     1     1     2     1     1     1     2     1     1     3
## Proportion 0.048 0.048 0.048 0.095 0.048 0.048 0.048 0.095 0.048 0.048 0.143
##                             
## Value         75    78    81
## Frequency      3     2     1
## Proportion 0.143 0.095 0.048
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       25        0        2 
##                     
## Value        II  III
## Frequency    14   11
## Proportion 0.56 0.44
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       23        2        3 
##                                
## Value        left rectum  right
## Frequency       8      1     14
## Proportion  0.348  0.043  0.609
## --------------------------------------------------------------------------------
# Mean age of the P group, ignoring missing ages.
a <- na.omit(clinical_p[["Age"]])
mean(a)
## [1] 66.52381
  • Only patients that do metastasize
# Patients whose class contains "Pm".
clinical_Pm <- clinic1_bal %>%
  filter(str_detect(class, "Pm"))


# Exploratory summary of a clinical table (redefinition used for the Pm group).
#
# clinical_Pm: data frame of clinical variables, one row per patient.
#
# Prints the column overview (glimpse), frequency tables and plots for the
# categorical variables (freq), the numeric profiling table (profiling_num) and
# the numeric-variable plots (plot_num). Returns the Hmisc::describe() summary,
# which is auto-printed when the function is called at top level.
basic_eda <- function(clinical_Pm) {
  glimpse(clinical_Pm)
  #df_Status(clinical_Pm)
  freq(clinical_Pm)
  # profiling_num() only returns its table; without print() the result is
  # silently discarded inside a function body.
  print(profiling_num(clinical_Pm))
  plot_num(clinical_Pm)
  describe(clinical_Pm)
}
basic_eda(clinical_Pm)
## Rows: 30
## Columns: 6
## $ class     <chr> "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", …
## $ organ     <chr> "rectum", "colon", "rectum", "colon", "colon", "colon", "rec…
## $ Sex       <chr> "f", "f", "f", "m", "f", "f", "m", "f", "f", "m", "m", "f", …
## $ Age       <dbl> 58, 85, 62, 67, 75, 69, 58, 57, 72, 65, 61, 78, 81, 57, 74, …
## $ Stage     <chr> "II", "II", "III", "III", "III", "II", "II", "III", "II", "I…
## $ sidedness <chr> "left", "right", "rectum", "rectum", "left", "left", "rectum…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1    Pm        30        100             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        25      83.33           83.33
## 2 rectum         5      16.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        17      56.67           56.67
## 2   m        13      43.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1   III        20      66.67           66.67
## 2    II        10      33.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1      left        13      43.33           43.33
## 2     right         9      30.00           73.33
## 3    rectum         5      16.67           90.00
## 4      <NA>         3      10.00          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinical_Pm 
## 
##  6  Variables      30  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct    value 
##       30        0        1       Pm 
##              
## Value      Pm
## Frequency  30
## Proportion  1
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       30        0        2 
##                         
## Value       colon rectum
## Frequency      25      5
## Proportion  0.833  0.167
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       30        0        2 
##                       
## Value          f     m
## Frequency     17    13
## Proportion 0.567 0.433
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        4       18    0.997    68.77    13.91    53.75    56.50 
##      .25      .50      .75      .90      .95 
##    58.75    69.00    77.25    85.00    85.75 
## 
## lowest : 37 53 56 57 58, highest: 78 81 85 86 87
##                                                                             
## Value         37    53    56    57    58    61    62    65    67    69    72
## Frequency      1     1     1     2     2     1     1     1     1     3     2
## Proportion 0.038 0.038 0.038 0.077 0.077 0.038 0.038 0.038 0.038 0.115 0.077
##                                                     
## Value         74    75    78    81    85    86    87
## Frequency      2     1     1     2     2     1     1
## Proportion 0.077 0.038 0.038 0.077 0.077 0.038 0.038
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       30        0        2 
##                       
## Value         II   III
## Frequency     10    20
## Proportion 0.333 0.667
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       27        3        3 
##                                
## Value        left rectum  right
## Frequency      13      5      9
## Proportion  0.481  0.185  0.333
## --------------------------------------------------------------------------------
# Mean age of the Pm group, ignoring missing ages.
a <- na.omit(clinical_Pm[["Age"]])
mean(a)
## [1] 68.76923
  • Statistical differences between patient groups for several clinical variables
# Copy of the clinical table with every character column converted to a factor.
clinical_factor <- clinic1_bal %>%
  mutate_if(is.character, as.factor)
#clinical_factor <- na.omit(clinical_factor)

# Proportion of each organ within each class.
ggplot(clinical_factor, aes(x = class, fill = organ)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Proportion of each sex within each class.
ggplot(clinical_factor, aes(x = class, fill = Sex)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Sidedness plot without rectal tumours. filter() also drops rows whose
# sidedness is NA, because the condition evaluates to NA for them.
clinical_factor1 <- clinical_factor %>%
  filter(!str_detect(sidedness, "rectum"))

ggplot(clinical_factor1, aes(x = class, fill = sidedness)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Proportion of each stage within each class.
ggplot(clinical_factor, aes(x = class, fill = Stage)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4", "#000080"))

# Overlaid age histograms, one semi-transparent layer per class.
# The `+` before labs() is required: without it labs(fill = "") is evaluated
# as a separate statement, so the legend title is never blanked and a bare
# "labels" object is printed instead.
clinical_factor %>%
  ggplot(aes(x = Age, fill = class)) +
  geom_histogram(color = "#e9ecef", alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("#69b3a2", "#404080")) +
  labs(fill = "")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing non-finite values (stat_bin).
# Class-by-organ contingency table, tested with Fisher's exact test.
stat_data_organ <- table(clinical_factor[["class"]], clinical_factor[["organ"]])
# TODO: add a plot(...) of this table
fisher.test(stat_data_organ)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_organ
## p-value = 0.2045
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    0.4735918 236.0073881
## sample estimates:
## odds ratio 
##   4.683156
# Class-by-sex contingency table, tested with Fisher's exact test.
stat_data_sex <- table(clinical_factor[["class"]], clinical_factor[["Sex"]])
fisher.test(stat_data_sex)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_sex
## p-value = 0.7899
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2504095 2.7417911
## sample estimates:
## odds ratio 
##  0.8312827
# Class-by-stage contingency table, tested with Fisher's exact test.
stat_data_stage <- table(clinical_factor[["class"]], clinical_factor[["Stage"]])
fisher.test(stat_data_stage)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_stage
## p-value = 0.109
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.7476062 8.7792316
## sample estimates:
## odds ratio 
##   2.500865
# Class-by-sidedness contingency table (left / rectum / right; NA sidedness is
# excluded by table()), tested with Fisher's exact test.
stat_data_side <- table(
  clinical_factor[["class"]],
  clinical_factor[["sidedness"]]
)
fisher.test(stat_data_side)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_side
## p-value = 0.1191
## alternative hypothesis: two.sided
# Age distribution within each class. The argument expressions are kept
# unchanged because hist() builds its default title from them.
hist(clinical_factor$Age[clinical_factor$class == "P"])

hist(clinical_factor$Age[clinical_factor$class == "Pm"])

# Per-class summary of Age (quartiles, mean and NA count).
tapply(X = clinical_factor$Age, INDEX = clinical_factor$class, FUN = summary)
## $P
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   36.00   60.00   71.00   66.52   75.00   81.00       4 
## 
## $Pm
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   37.00   58.75   69.00   68.77   77.25   87.00       4
# Welch two-sample t-test comparing mean Age between the two classes.
t.test(Age ~ class, data = clinical_factor)
## 
##  Welch Two Sample t-test
## 
## data:  Age by class
## t = -0.63093, df = 43.074, p-value = 0.5314
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9.422239  4.931396
## sample estimates:
##  mean in group P mean in group Pm 
##         66.52381         68.76923

3.2.3 Survival analysis

3.2.3.1 Stage

# Join the survival table and the clinical table on their row names.
# NOTE(review): `data` is also the name of utils::data(); the assignment creates
# a variable and does not break the function, but a more specific name would be
# clearer.
data <- merge(datasurv1, clinical, by="row.names")

# Survival curves stratified by tumour stage.
# NOTE(review): `time` and `Status` presumably come from datasurv1 - confirm
# their units and event coding.
fit <- survfit(Surv(time, Status) ~ Stage, data = data)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ Stage, data = data)
## 
##            n events median 0.95LCL 0.95UCL
## Stage=II  24     11   2963    2011      NA
## Stage=III 31     17   1157     797      NA
# Summary of the survival curves: for every event time in each stage, the
# number at risk, number of events, survival estimate, standard error and
# 95% confidence interval.
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ Stage, data = data)
## 
##                 Stage=II 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     24       1    0.958  0.0408        0.882        1.000
##   204     22       1    0.915  0.0577        0.808        1.000
##   400     21       1    0.871  0.0695        0.745        1.000
##  1357     20       1    0.828  0.0785        0.687        0.997
##  1428     19       1    0.784  0.0856        0.633        0.971
##  1635     16       1    0.735  0.0932        0.573        0.942
##  1726     15       1    0.686  0.0990        0.517        0.910
##  2011     13       1    0.633  0.1045        0.458        0.875
##  2545      9       1    0.563  0.1142        0.378        0.838
##  2963      7       1    0.483  0.1230        0.293        0.795
##  3044      6       1    0.402  0.1261        0.218        0.743
## 
##                 Stage=III 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   216     31       1    0.968  0.0317        0.908        1.000
##   386     30       1    0.935  0.0441        0.853        1.000
##   420     29       1    0.903  0.0531        0.805        1.000
##   500     28       1    0.871  0.0602        0.761        0.997
##   518     27       1    0.839  0.0661        0.719        0.979
##   520     26       1    0.806  0.0710        0.679        0.958
##   573     25       1    0.774  0.0751        0.640        0.936
##   632     24       1    0.742  0.0786        0.603        0.913
##   665     23       1    0.710  0.0815        0.567        0.889
##   682     22       1    0.677  0.0840        0.531        0.864
##   797     21       1    0.645  0.0859        0.497        0.838
##   807     20       1    0.613  0.0875        0.463        0.811
##   857     18       1    0.579  0.0890        0.428        0.782
##  1107     14       1    0.538  0.0917        0.385        0.751
##  1157     13       1    0.496  0.0935        0.343        0.718
##  1682     10       1    0.447  0.0965        0.292        0.682
##  1835      9       1    0.397  0.0977        0.245        0.643
# Short per-stratum summary table: records, events, restricted mean and median
# survival with its 95% confidence limits.
summary(fit)$table
##           records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## Stage=II       24    24      24     11 2774.640   310.5367   2963    2011
## Stage=III      31    31      31     17 2179.046   321.5120   1157     797
##           0.95UCL
## Stage=II       NA
## Stage=III      NA
# Plot the stage-stratified survival curves and annotate the plot with a
# p-value (pval = TRUE).
ggsurvplot(fit, data = data, pval = TRUE)

# Log-rank test for a difference in survival between stages.
surv.stage <- survdiff(Surv(time,Status) ~ Stage, data = data)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ Stage, data = data)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## Stage=II  24       11     14.9      1.01      2.27
## Stage=III 31       17     13.1      1.15      2.27
## 
##  Chisq= 2.3  on 1 degrees of freedom, p= 0.1

3.2.3.2 Class - PM vs. P

# Survival curves stratified by class (P vs. Pm); overwrites the `fit` object
# from the stage analysis.
fit <- survfit(Surv(time, Status) ~ class, data = data)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ class, data = data)
## 
##           n events median 0.95LCL 0.95UCL
## class=P  25      6   3044    2963      NA
## class=Pm 30     22   1107     682    2011
# Summary of the survival curves: for every event time in each class, the
# number at risk, number of events, survival estimate, standard error and
# 95% confidence interval.
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ class, data = data)
## 
##                 class=P 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   204     24       1    0.958  0.0408       0.8816            1
##   665     23       1    0.917  0.0564       0.8125            1
##  1726     15       1    0.856  0.0791       0.7137            1
##  2545      7       1    0.733  0.1319       0.5154            1
##  2963      4       1    0.550  0.1871       0.2824            1
##  3044      2       1    0.275  0.2158       0.0591            1
## 
##                 class=Pm 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     30       1    0.967  0.0328        0.905        1.000
##   216     29       1    0.933  0.0455        0.848        1.000
##   386     28       1    0.900  0.0548        0.799        1.000
##   400     27       1    0.867  0.0621        0.753        0.997
##   420     26       1    0.833  0.0680        0.710        0.978
##   500     25       1    0.800  0.0730        0.669        0.957
##   518     24       1    0.767  0.0772        0.629        0.934
##   520     23       1    0.733  0.0807        0.591        0.910
##   573     22       1    0.700  0.0837        0.554        0.885
##   632     21       1    0.667  0.0861        0.518        0.859
##   682     20       1    0.633  0.0880        0.482        0.832
##   797     19       1    0.600  0.0894        0.448        0.804
##   807     18       1    0.567  0.0905        0.414        0.775
##   857     17       1    0.533  0.0911        0.382        0.745
##  1107     15       1    0.498  0.0917        0.347        0.714
##  1157     14       1    0.462  0.0918        0.313        0.682
##  1357     13       1    0.427  0.0913        0.280        0.649
##  1428     12       1    0.391  0.0904        0.249        0.615
##  1635     11       1    0.356  0.0889        0.218        0.580
##  1682     10       1    0.320  0.0868        0.188        0.545
##  1835      9       1    0.284  0.0841        0.159        0.508
##  2011      7       1    0.244  0.0813        0.127        0.469
# Short per-stratum summary table: records, events, restricted mean and median
# survival with its 95% confidence limits.
summary(fit)$table
##          records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## class=P       25    25      25      6 2986.683   324.5666   3044    2963
## class=Pm      30    30      30     22 1716.567   276.5550   1107     682
##          0.95UCL
## class=P       NA
## class=Pm    2011
# Plot the class-stratified survival curves and annotate the plot with a
# p-value (pval = TRUE).
ggsurvplot(fit, data = data, pval = TRUE)

# Log-rank test for a difference in survival between classes.
# NOTE(review): the result is stored in `surv.stage`, overwriting the stage
# test result even though this is the class comparison.
surv.stage <- survdiff(Surv(time,Status) ~ class, data = data)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ class, data = data)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## class=P  25        6     14.5      4.96      10.5
## class=Pm 30       22     13.5      5.30      10.5
## 
##  Chisq= 10.5  on 1 degrees of freedom, p= 0.001

3.2.3.3 Sidedness

# Exclude rectal tumours so that only left- and right-sided tumours are
# compared. A logical mask replaces data[-which(...), ]: when no row matches,
# -which() yields an empty index and silently drops every row. `%in%` returns
# FALSE for NA, so rows with missing sidedness are kept exactly as before and
# are then reported as missing by survfit().
newdata <- data[!(data$sidedness %in% "rectum"), ]
# Survival curves stratified by sidedness; overwrites the previous `fit`.
fit <- survfit(Surv(time, Status) ~ sidedness, data = newdata)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
##    5 observations deleted due to missingness 
##                  n events median 0.95LCL 0.95UCL
## sidedness=left  21      9   3044    1428      NA
## sidedness=right 23     12   2011    1157      NA
# Summary of the survival curves: for every event time on each side, the
# number at risk, number of events, survival estimate, standard error and
# 95% confidence interval.
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
## 5 observations deleted due to missingness 
##                 sidedness=left 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   400     21       1    0.952  0.0465        0.866        1.000
##   520     20       1    0.905  0.0641        0.788        1.000
##   632     19       1    0.857  0.0764        0.720        1.000
##   682     18       1    0.810  0.0857        0.658        0.996
##   807     17       1    0.762  0.0929        0.600        0.968
##  1107     14       1    0.707  0.1010        0.535        0.936
##  1428     12       1    0.649  0.1084        0.467        0.900
##  1835     11       1    0.590  0.1135        0.404        0.860
##  3044      4       1    0.442  0.1534        0.224        0.873
## 
##                 sidedness=right 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     23       1    0.957  0.0425        0.877        1.000
##   204     21       1    0.911  0.0601        0.800        1.000
##   420     20       1    0.865  0.0723        0.735        1.000
##   500     19       1    0.820  0.0816        0.675        0.997
##   665     18       1    0.774  0.0889        0.618        0.970
##   797     17       1    0.729  0.0946        0.565        0.940
##   857     15       1    0.680  0.1000        0.510        0.907
##  1157     13       1    0.628  0.1051        0.452        0.872
##  1635     10       1    0.565  0.1118        0.383        0.833
##  1726      9       1    0.502  0.1157        0.320        0.789
##  2011      8       1    0.440  0.1170        0.261        0.741
##  2545      4       1    0.330  0.1294        0.153        0.712
# Short per-stratum summary table: records, events, restricted mean and median
# survival with its 95% confidence limits.
summary(fit)$table
##                 records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## sidedness=left       21    21      21      9 2701.025   365.2286   3044    1428
## sidedness=right      23    23      23     12 2227.222   362.8170   2011    1157
##                 0.95UCL
## sidedness=left       NA
## sidedness=right      NA
# Plot the sidedness-stratified survival curves and annotate the plot with a
# p-value (pval = TRUE).
ggsurvplot(fit, data = newdata, pval = TRUE)

# Log-rank test for a difference in survival between left and right.
# NOTE(review): the result is stored in `surv.stage`, overwriting the earlier
# test result even though this is the sidedness comparison.
surv.stage <- survdiff(Surv(time,Status) ~ sidedness, data = newdata)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
## n=44, 5 observations deleted due to missingness.
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## sidedness=left  21        9    11.06     0.382     0.812
## sidedness=right 23       12     9.94     0.425     0.812
## 
##  Chisq= 0.8  on 1 degrees of freedom, p= 0.4

3.2.4 DEGs

# Build the inputs for the differential-expression analysis:
#   xdata - expression table from rnaseq1, restricted to samples that also
#           appear in the clinical table, then transposed to genes x samples
#   ydata - clinical table (DATASET1_bal) restricted to the same samples
# Both tables are sorted by sample ID (row name) so they line up by position.
xdata <- rnaseq1[order(row.names(rnaseq1)), ]
rownames(DATASET1_bal) <- DATASET1_bal$ID
ydata <- as.data.frame(DATASET1_bal[order(row.names(DATASET1_bal)), ])
rownames(ydata) <- ydata$ID

# keep only samples present in both tables
xdata <- xdata[rownames(xdata) %in% rownames(ydata), ]
ydata <- as.data.frame(ydata[rownames(ydata) %in% rownames(xdata), ])

# The group labels below are attached to the count columns purely by
# position, so fail loudly if the two tables are not aligned sample by sample.
stopifnot(identical(rownames(xdata), rownames(ydata)))

# keep features with standard deviation > 0
# (vapply always returns a plain numeric vector, unlike sapply)
feature_sd <- vapply(
  seq_len(ncol(xdata)),
  function(ix) sd(xdata[, ix]),
  numeric(1)
)
xdata <- xdata[, feature_sd != 0]
xdata <- t(xdata)

group <- as.factor(ydata$class)
# NOTE: this variable is named like base::class(); the name is kept because
# later chunks may reference it.
class <- as.data.frame(ydata$class)
edgeR.DGElist <- DGEList(counts = xdata, group = group)

# edgeR differential-expression workflow on edgeR.DGElist: filter low-count
# genes, estimate dispersions, fit the negative binomial GLM and run a
# likelihood-ratio test.

# remove genes that do not have one count per million in at least 5 samples
keep <- rowSums(cpm(edgeR.DGElist) >= 1) >= 5
edgeR.DGElist <- edgeR.DGElist[keep ,]
# NOTE(review): no normalization-factor step (e.g. calcNormFactors) is visible
# between DGEList() and estimateDisp() in this chunk - confirm this is intended.

# specify the design setup: intercept plus one coefficient for the group factor

design <- model.matrix(~group)

# estimate the dispersion for all read counts across all samples
edgeR.DGElist <- estimateDisp(edgeR.DGElist, design)

# fit the negative binomial model
edger_fit <- glmFit(edgeR.DGElist, design )

# perform the testing for every gene using the neg. binomial model
# (no coef/contrast is given, so glmLRT tests its default coefficient; the
# recorded output below reports it as groupPm)
edger_lrt <- glmLRT(edger_fit)
summary(decideTests(edger_lrt))
##        groupPm
## Down      1005
## NotSig   18381
## Up         835
# extract the results table from edger_lrt with BH-adjusted p-values,
# keeping all genes (n = Inf) sorted by p-value
DGE.results_edgeR <- topTags(edger_lrt, n = Inf , sort.by = "PValue" , adjust.method = "BH" )

topTags(DGE.results_edgeR) #table with the top10 DEGs
## Coefficient:  groupPm 
##             logFC    logCPM       LR       PValue          FDR
## CHGA    -8.017340  6.906952 62.54124 2.609236e-15 5.276137e-11
## TPH1    -6.062430  4.908484 46.40563 9.613905e-12 9.720139e-08
## VWA5B2  -4.644305  1.961363 37.35393 9.852224e-10 6.640728e-06
## STXBP5L -5.284757  2.612217 35.97941 1.994132e-09 1.008084e-05
## EGFR    -3.291552  6.398804 34.90765 3.457204e-09 1.398162e-05
## IDO1    -3.202438  2.789425 33.54228 6.973165e-09 2.350073e-05
## PEG3    -4.262645  2.669096 33.23490 8.167131e-09 2.359251e-05
## MIR3978  4.038345 -1.040147 32.55673 1.157634e-08 2.926066e-05
## NLRP2   -2.397736  2.333213 32.04905 1.503287e-08 3.377553e-05
## PTPRN   -5.380491  3.370588 31.74708 1.756133e-08 3.551077e-05
# Keep the genes called differentially expressed (FDR < 0.05), export them to
# Excel, and derive the top-100 gene lists used later. genes_deg inherits the
# p-value ordering of DGE.results_edgeR, so "top" means lowest p-value.
genes_deg <- DGE.results_edgeR$table
# which() drops rows whose FDR is NA instead of producing NA rows
genes_deg <- genes_deg[which(genes_deg$FDR < 0.05), ]
dim(genes_deg) # genes found to be differentially expressed
## [1] 1840    5
genes_deg$row <- row.names(genes_deg)
write_xlsx(genes_deg, "genes_deg_d2.xlsx")

# head() returns at most 100 names; indexing with 1:100 would pad the result
# with "NA" names whenever fewer than 100 genes pass the filter.
top100_deg <- head(rownames(genes_deg), 100)

# up-regulated genes (positive log fold change)
high <- genes_deg[genes_deg$logFC > 0, ]
dim(high)
## [1] 835   6
top100_high <- head(rownames(high), 100)

# down-regulated genes (negative log fold change)
low <- genes_deg[genes_deg$logFC < 0, ]
dim(low)
## [1] 1005    6
top100_low <- head(rownames(low), 100)

3.2.5 Classification

  • In this work we tested three types of classification: 1) classification without regularization, based on DEGs; 2) classification with regularization (elastic net, EN, and iTwiner); 3) classification without regularization, based on the genes selected by regularized logistic regression.
# Prepare the inputs for the classification experiments:
#   xdata - predictors from xdataT with zero-variance features removed
#   ydata - response table with the class coded as P = 0, Pm = 1, plus sample ID
# Both are restricted to shared samples and sorted by sample ID.
xdata.raw <- xdataT

# keep features with standard deviation > 0
# (vapply always returns a plain numeric vector, unlike sapply)
feature_sd <- vapply(
  seq_len(ncol(xdata.raw)),
  function(ix) sd(xdata.raw[, ix]),
  numeric(1)
)
xdata <- xdata.raw[, feature_sd != 0]

# NOTE(review): clinic1$class and DATASET1$ID are paired by position here -
# this assumes both tables list the samples in the same order; confirm.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

# ydata.raw$class <- c(rep(0,28),rep(1,34))

# recode the class labels: P -> 0, Pm -> 1
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# keep only samples present in both tables
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ])

# sort both tables by sample ID so rows correspond by position
xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]

# Predictors and labels are combined by position downstream, so fail loudly
# if the two tables are not aligned sample by sample.
stopifnot(identical(rownames(xdata), rownames(ydata.raw)))

ydata <- as.data.frame(ydata.raw[, 1:2])
ydata$`clinic1$class` <- as.numeric(ydata$`clinic1$class`)

3.2.5.1 1) Classification without regularization based on DEGs

Five classifiers were used: Decision trees, linear and radial support vector machines, logistic regression and random forest

  • Let's pick the 50 differentially expressed genes with the lowest p-values found above
# Build the predictor table for the DEG-based classifiers: the 50 DEGs with the
# lowest p-values (top100_deg is ordered by p-value), renamed Var1..Var50, plus
# the class label as the factor column `type`.
#
# The label column is attached by position, so the rows of xdataT are selected
# in exactly the sample order of ydata (matched on row names) instead of
# relying on xdataT already being in that order.
sample_idx <- match(rownames(ydata), rownames(xdataT))
stopifnot(!anyNA(sample_idx))
xdata <- xdataT[sample_idx, top100_deg[1:50]]

# keep the original gene symbols so the VarN names can be mapped back later
nomesgenes <- colnames(xdata)
colnames(xdata) <- paste0("Var", seq_along(nomesgenes))
colnames(ydata) <- c("class", "row")
xdata$type <- as.factor(ydata$class)

#xdata <- xdata[colMeans(xdata == 0) <= 0.6] #delete genes that have null values in at least 60% of the samples
  • Test the different classifiers 100 times to obtain median values for the measures of model performance such as accuracy, misclassification, sensitivity and specificity, among others
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.2.5.1.1 Measures of model performance train
  • Accuracy
# acc
# Training-set accuracy: histogram plus mean / median / sd for each row of
# acc_train (presumably one column per repeated run).
# Going by the variable names, rows are presumably 1 = decision tree,
# 2 = linear SVM, 3 = radial SVM, 4 = logistic regression, 5 = random forest
# - TODO confirm against the chunk that fills acc_train.
acc_trees <- acc_train[1,]
hist(acc_trees)

mean(acc_trees)
## [1] 0.9844737
median(acc_trees)
## [1] 0.9736842
sd(acc_trees)
## [1] 0.01634458
acc_svm <- acc_train[2,]
hist(acc_svm)

mean(acc_svm)
## [1] 0.9318421
median(acc_svm)
## [1] 0.9473684
sd(acc_svm)
## [1] 0.03852665
acc_svmR <- acc_train[3,]
hist(acc_svmR)

mean(acc_svmR)
## [1] 0.9344737
median(acc_svmR)
## [1] 0.9736842
sd(acc_svmR)
## [1] 0.1094652
acc_logs <- acc_train[4,]
hist(acc_logs)

mean(acc_logs)
## [1] 1
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0
acc_rf <- acc_train[5,]
hist(acc_rf)

mean(acc_rf)
## [1] 1
median(acc_rf)
## [1] 1
sd(acc_rf)
## [1] 0
  • auc
# auc
# Training-set AUC: mean / median / sd for each row of auc_train
# (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM, 4 = logistic
# regression, 5 = random forest - TODO confirm).
auc_trees <- auc_train[1,]
mean(auc_trees)
## [1] 0.9830392
median(auc_trees)
## [1] 0.9761905
sd(auc_trees)
## [1] 0.0177827
auc_svm <- auc_train[2,]
mean(auc_svm)
## [1] 0.9295938
median(auc_svm)
## [1] 0.9439776
sd(auc_svm)
## [1] 0.04390672
auc_svmR <- auc_train[3,]
mean(auc_svmR)
## [1] 0.9308543
median(auc_svmR)
## [1] 0.9733894
sd(auc_svmR)
## [1] 0.122283
auc_logs <- auc_train[4,]
mean(auc_logs)
## [1] 1
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0
auc_rf <- auc_train[5,]
mean(auc_rf)
## [1] 1
median(auc_rf)
## [1] 1
sd(auc_rf)
## [1] 0
  • Misclassification
# miscl
# Training-set misclassifications: mean / median / sd for each row of
# miscl_train. The recorded means exceed 1, so these look like counts of
# misclassified samples rather than rates - TODO confirm.
# (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM, 4 = logistic
# regression, 5 = random forest)
miscl_trees <- miscl_train[1,]
mean(miscl_trees)
## [1] 0.59
median(miscl_trees)
## [1] 1
sd(miscl_trees)
## [1] 0.6210939
miscl_svm <- miscl_train[2,]
mean(miscl_svm)
## [1] 2.59
median(miscl_svm)
## [1] 2
sd(miscl_svm)
## [1] 1.464013
miscl_svmR <- miscl_train[3,]
mean(miscl_svmR)
## [1] 2.49
median(miscl_svmR)
## [1] 1
sd(miscl_svmR)
## [1] 4.159679
miscl_logs <- miscl_train[4,]
mean(miscl_logs)
## [1] 0
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0
miscl_rf <- miscl_train[5,]
mean(miscl_rf)
## [1] 0
median(miscl_rf)
## [1] 0
sd(miscl_rf)
## [1] 0
  • Sensitivity
# sensitivity
# Training-set sensitivity: mean / median / sd for each row of
# sensitivity_train (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM,
# 4 = logistic regression, 5 = random forest - TODO confirm).
sensitivity_trees <- sensitivity_train[1,]
mean(sensitivity_trees)
## [1] 0.9694118
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.03394119
sensitivity_svm <- sensitivity_train[2,]
mean(sensitivity_svm)
## [1] 0.9082353
median(sensitivity_svm)
## [1] 0.9411765
sd(sensitivity_svm)
## [1] 0.109753
sensitivity_svmR <- sensitivity_train[3,]
mean(sensitivity_svmR)
## [1] 0.8964706
median(sensitivity_svmR)
## [1] 1
sd(sensitivity_svmR)
## [1] 0.2487422
sensitivity_logs <- sensitivity_train[4,]
mean(sensitivity_logs)
## [1] 1
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0
sensitivity_rf <- sensitivity_train[5,]
mean(sensitivity_rf)
## [1] 1
median(sensitivity_rf)
## [1] 1
sd(sensitivity_rf)
## [1] 0
  • Specificity
# specificity
# Training-set specificity: mean / median / sd for each row of
# specificity_train (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM,
# 4 = logistic regression, 5 = random forest - TODO confirm).
specificity_trees <- specificity_train[1,]
mean(specificity_trees)
## [1] 0.9966667
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.01221107
specificity_svm <- specificity_train[2,]
mean(specificity_svm)
## [1] 0.9509524
median(specificity_svm)
## [1] 0.952381
sd(specificity_svm)
## [1] 0.04994557
specificity_svmR <- specificity_train[3,]
mean(specificity_svmR)
## [1] 0.9652381
median(specificity_svmR)
## [1] 0.9761905
sd(specificity_svmR)
## [1] 0.04326141
specificity_logs <- specificity_train[4,]
mean(specificity_logs)
## [1] 1
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0
specificity_rf <- specificity_train[5,]
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# fneg
# Training-set false negatives: mean / median / sd for each row of fneg_train.
# The recorded means exceed 1, so these look like counts rather than rates
# - TODO confirm. (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM,
# 4 = logistic regression, 5 = random forest)
fneg_trees <- fneg_train[1,]
mean(fneg_trees)
## [1] 0.52
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.5770003
fneg_svm <- fneg_train[2,]
mean(fneg_svm)
## [1] 1.56
median(fneg_svm)
## [1] 1
sd(fneg_svm)
## [1] 1.865801
fneg_svmR <- fneg_train[3,]
mean(fneg_svmR)
## [1] 1.76
median(fneg_svmR)
## [1] 0
sd(fneg_svmR)
## [1] 4.228618
fneg_logs <- fneg_train[4,]
mean(fneg_logs)
## [1] 0
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0
fneg_rf <- fneg_train[5,]
mean(fneg_rf)
## [1] 0
median(fneg_rf)
## [1] 0
sd(fneg_rf)
## [1] 0
3.2.5.1.2 Measures of model performance test
  • Accuracy
# acc
# Test-set accuracy: histogram plus mean / median / sd for each row of acc
# (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM, 4 = logistic
# regression, 5 = random forest - TODO confirm).
# NOTE: the acc_* variables assigned here reuse the names of the training
# summaries, so the training values are overwritten from this point on.
acc_trees <- acc[1,]
hist(acc_trees)

mean(acc_trees)
## [1] 0.6629412
median(acc_trees)
## [1] 0.6470588
sd(acc_trees)
## [1] 0.10269
acc_svm <- acc[2,]
hist(acc_svm)

mean(acc_svm)
## [1] 0.7264706
median(acc_svm)
## [1] 0.7058824
sd(acc_svm)
## [1] 0.09211134
acc_svmR <- acc[3,]
hist(acc_svmR)

mean(acc_svmR)
## [1] 0.62
median(acc_svmR)
## [1] 0.5882353
sd(acc_svmR)
## [1] 0.1123029
acc_logs <- acc[4,]
hist(acc_logs)

mean(acc_logs)
## [1] 0.6564706
median(acc_logs)
## [1] 0.6470588
sd(acc_logs)
## [1] 0.08473757
acc_rf <- acc[5,]
hist(acc_rf)

mean(acc_rf)
## [1] 0.7288235
median(acc_rf)
## [1] 0.7058824
sd(acc_rf)
## [1] 0.08961878
  • auc
# auc
# Test-set AUC: mean / median / sd for each row of auc (rows presumably
# 1 = tree, 2 = linear SVM, 3 = radial SVM, 4 = logistic regression,
# 5 = random forest - TODO confirm).
# NOTE: overwrites the auc_* variables holding the training summaries.
auc_trees <- auc[1,]
mean(auc_trees)
## [1] 0.6609028
median(auc_trees)
## [1] 0.6527778
sd(auc_trees)
## [1] 0.1016463
auc_svm <- auc[2,]
mean(auc_svm)
## [1] 0.7238889
median(auc_svm)
## [1] 0.7083333
sd(auc_svm)
## [1] 0.08707307
auc_svmR <- auc[3,]
mean(auc_svmR)
## [1] 0.6244444
median(auc_svmR)
## [1] 0.6145833
sd(auc_svmR)
## [1] 0.1137441
auc_logs <- auc[4,]
mean(auc_logs)
## [1] 0.6536111
median(auc_logs)
## [1] 0.6388889
sd(auc_logs)
## [1] 0.08192007
auc_rf <- auc[5,]
mean(auc_rf)
## [1] 0.7268056
median(auc_rf)
## [1] 0.7083333
sd(auc_rf)
## [1] 0.09063707
  • Misclassification
# miscl
# Test-set misclassifications: mean / median / sd for each row of miscl.
# The recorded values are well above 1, so these look like counts of
# misclassified samples rather than rates - TODO confirm.
# NOTE: overwrites the miscl_* variables holding the training summaries.
miscl_trees <- miscl[1,]
mean(miscl_trees)
## [1] 5.73
median(miscl_trees)
## [1] 6
sd(miscl_trees)
## [1] 1.745731
miscl_svm <- miscl[2,]
mean(miscl_svm)
## [1] 4.65
median(miscl_svm)
## [1] 5
sd(miscl_svm)
## [1] 1.565893
miscl_svmR <- miscl[3,]
mean(miscl_svmR)
## [1] 6.46
median(miscl_svmR)
## [1] 7
sd(miscl_svmR)
## [1] 1.909149
miscl_logs <- miscl[4,]
mean(miscl_logs)
## [1] 5.84
median(miscl_logs)
## [1] 6
sd(miscl_logs)
## [1] 1.440539
miscl_rf <- miscl[5,]
mean(miscl_rf)
## [1] 4.61
median(miscl_rf)
## [1] 5
sd(miscl_rf)
## [1] 1.523519
  • Sensitivity
# sensitivity
# Test-set sensitivity: mean / median / sd for each row of sensitivity
# (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM, 4 = logistic
# regression, 5 = random forest - TODO confirm).
# NOTE: overwrites the sensitivity_* variables holding the training summaries.
sensitivity_trees <- sensitivity[1,]
mean(sensitivity_trees)
## [1] 0.59625
median(sensitivity_trees)
## [1] 0.625
sd(sensitivity_trees)
## [1] 0.1862161
sensitivity_svm <- sensitivity[2,]
mean(sensitivity_svm)
## [1] 0.6275
median(sensitivity_svm)
## [1] 0.625
sd(sensitivity_svm)
## [1] 0.1986221
sensitivity_svmR <- sensitivity[3,]
mean(sensitivity_svmR)
## [1] 0.625
median(sensitivity_svmR)
## [1] 0.75
sd(sensitivity_svmR)
## [1] 0.3207866
sensitivity_logs <- sensitivity[4,]
mean(sensitivity_logs)
## [1] 0.565
median(sensitivity_logs)
## [1] 0.625
sd(sensitivity_logs)
## [1] 0.1623299
sensitivity_rf <- sensitivity[5,]
mean(sensitivity_rf)
## [1] 0.6925
median(sensitivity_rf)
## [1] 0.75
sd(sensitivity_rf)
## [1] 0.1661165
  • Specificity
# specificity
# Test-set specificity: mean / median / sd for each row of specificity
# (rows presumably 1 = tree, 2 = linear SVM, 3 = radial SVM, 4 = logistic
# regression, 5 = random forest - TODO confirm).
# NOTE: overwrites the specificity_* variables holding the training summaries.
specificity_trees <- specificity[1,]
mean(specificity_trees)
## [1] 0.7222222
median(specificity_trees)
## [1] 0.7777778
sd(specificity_trees)
## [1] 0.1468801
specificity_svm <- specificity[2,]
mean(specificity_svm)
## [1] 0.8144444
median(specificity_svm)
## [1] 0.7777778
sd(specificity_svm)
## [1] 0.151523
specificity_svmR <- specificity[3,]
mean(specificity_svmR)
## [1] 0.6155556
median(specificity_svmR)
## [1] 0.5555556
sd(specificity_svmR)
## [1] 0.2224578
specificity_logs <- specificity[4,]
mean(specificity_logs)
## [1] 0.7377778
median(specificity_logs)
## [1] 0.7777778
sd(specificity_logs)
## [1] 0.131677
specificity_rf <- specificity[5,]
mean(specificity_rf)
## [1] 0.7611111
median(specificity_rf)
## [1] 0.7777778
sd(specificity_rf)
## [1] 0.1398117
  • False Neg
# fneg
# Test-set false negatives: mean / median / sd for each row of fneg.
# The recorded values are above 1, so these look like counts rather than
# rates - TODO confirm.
# NOTE: overwrites the fneg_* variables holding the training summaries.
fneg_trees <- fneg[1,]
mean(fneg_trees)
## [1] 3.23
median(fneg_trees)
## [1] 3
sd(fneg_trees)
## [1] 1.489729
fneg_svm <- fneg[2,]
mean(fneg_svm)
## [1] 2.98
median(fneg_svm)
## [1] 3
sd(fneg_svm)
## [1] 1.588977
fneg_svmR <- fneg[3,]
mean(fneg_svmR)
## [1] 3
median(fneg_svmR)
## [1] 2
sd(fneg_svmR)
## [1] 2.566293
fneg_logs <- fneg[4,]
mean(fneg_logs)
## [1] 3.48
median(fneg_logs)
## [1] 3
sd(fneg_logs)
## [1] 1.29864
fneg_rf <- fneg[5,]
mean(fneg_rf)
## [1] 2.46
median(fneg_rf)
## [1] 2
sd(fneg_rf)
## [1] 1.328932

3.2.5.2 2) Classification with regularization (EN and iTwiner)

# Prepare the inputs for the regularized classifiers (EN and iTwiner):
#   xdata - all predictors from xdataT (no variance filter in this section)
#   ydata - response table with the class coded as P = 0, Pm = 1, plus sample ID
# Both are restricted to shared samples and sorted by sample ID.
xdata <- xdataT

# NOTE(review): clinic1$class and DATASET1$ID are paired by position here -
# this assumes both tables list the samples in the same order; confirm.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID
#
# # ydata.raw$class <- c(rep(0,28),rep(1,34))
#
# recode the class labels: P -> 0, Pm -> 1
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# keep only samples present in both tables
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ])

# sort both tables by sample ID so rows correspond by position
xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]

# Predictors and labels are combined by position downstream, so fail loudly
# if the two tables are not aligned sample by sample.
stopifnot(identical(rownames(xdata), rownames(ydata.raw)))

ydata <- as.data.frame(ydata.raw)
ydata$`clinic1$class` <- as.numeric(ydata$`clinic1$class`)
## [1] 0
## [1] 1
## [1] 1011
## [1] "data"
## [1] "EN"
## [1] 1
## [1] 2
## [1] 1022
## [1] "data"
## [1] "EN"
## [1] 2
## [1] 3
## [1] 1033
## [1] "data"
## [1] "EN"
## [1] 3
## [1] 4
## [1] 1044
## [1] "data"
## [1] "EN"
## [1] 4
## [1] 5
## [1] 1055
## [1] "data"
## [1] "EN"
## [1] 5
## [1] 6
## [1] 1066
## [1] "data"
## [1] "EN"
## [1] 6
## [1] 7
## [1] 1077
## [1] "data"
## [1] "EN"
## [1] 7
## [1] 8
## [1] 1088
## [1] "data"
## [1] "EN"
## [1] 8
## [1] 9
## [1] 1099
## [1] "data"
## [1] "EN"
## [1] 9
## [1] 10
## [1] 1110
## [1] "data"
## [1] "EN"
## [1] 10
## [1] 11
## [1] 1121
## [1] "data"
## [1] "EN"
## [1] 11
## [1] 12
## [1] 1132
## [1] "data"
## [1] "EN"
## [1] 12
## [1] 13
## [1] 1143
## [1] "data"
## [1] "EN"
## [1] 13
## [1] 14
## [1] 1154
## [1] "data"
## [1] "EN"
## [1] 14
## [1] 15
## [1] 1165
## [1] "data"
## [1] "EN"
## [1] 15
## [1] 16
## [1] 1176
## [1] "data"
## [1] "EN"
## [1] 16
## [1] 17
## [1] 1187
## [1] "data"
## [1] "EN"
## [1] 17
## [1] 18
## [1] 1198
## [1] "data"
## [1] "EN"
## [1] 18
## [1] 19
## [1] 1209
## [1] "data"
## [1] "EN"
## [1] 19
## [1] 20
## [1] 1220
## [1] "data"
## [1] "EN"
## [1] 20
## [1] 21
## [1] 1231
## [1] "data"
## [1] "EN"
## [1] 21
## [1] 22
## [1] 1242
## [1] "data"
## [1] "EN"
## [1] 22
## [1] 23
## [1] 1253
## [1] "data"
## [1] "EN"
## [1] 23
## [1] 24
## [1] 1264
## [1] "data"
## [1] "EN"
## [1] 24
## [1] 25
## [1] 1275
## [1] "data"
## [1] "EN"
## [1] 25
## [1] 26
## [1] 1286
## [1] "data"
## [1] "EN"
## [1] 26
## [1] 27
## [1] 1297
## [1] "data"
## [1] "EN"
## [1] 27
## [1] 28
## [1] 1308
## [1] "data"
## [1] "EN"
## [1] 28
## [1] 29
## [1] 1319
## [1] "data"
## [1] "EN"
## [1] 29
## [1] 30
## [1] 1330
## [1] "data"
## [1] "EN"
## [1] 30
## [1] 31
## [1] 1341
## [1] "data"
## [1] "EN"
## [1] 31
## [1] 32
## [1] 1352
## [1] "data"
## [1] "EN"
## [1] 32
## [1] 33
## [1] 1363
## [1] "data"
## [1] "EN"
## [1] 33
## [1] 34
## [1] 1374
## [1] "data"
## [1] "EN"
## [1] 34
## [1] 35
## [1] 1385
## [1] "data"
## [1] "EN"
## [1] 35
## [1] 36
## [1] 1396
## [1] "data"
## [1] "EN"
## [1] 36
## [1] 37
## [1] 1407
## [1] "data"
## [1] "EN"
## [1] 37
## [1] 38
## [1] 1418
## [1] "data"
## [1] "EN"
## [1] 38
## [1] 39
## [1] 1429
## [1] "data"
## [1] "EN"
## [1] 39
## [1] 40
## [1] 1440
## [1] "data"
## [1] "EN"
## [1] 40
## [1] 41
## [1] 1451
## [1] "data"
## [1] "EN"
## [1] 41
## [1] 42
## [1] 1462
## [1] "data"
## [1] "EN"
## [1] 42
## [1] 43
## [1] 1473
## [1] "data"
## [1] "EN"
## [1] 43
## [1] 44
## [1] 1484
## [1] "data"
## [1] "EN"
## [1] 44
## [1] 45
## [1] 1495
## [1] "data"
## [1] "EN"
## [1] 45
## [1] 46
## [1] 1506
## [1] "data"
## [1] "EN"
## [1] 46
## [1] 47
## [1] 1517
## [1] "data"
## [1] "EN"
## [1] 47
## [1] 48
## [1] 1528
## [1] "data"
## [1] "EN"
## [1] 48
## [1] 49
## [1] 1539
## [1] "data"
## [1] "EN"
## [1] 49
## [1] 50
## [1] 1550
## [1] "data"
## [1] "EN"
## [1] 50
## [1] 51
## [1] 1561
## [1] "data"
## [1] "EN"
## [1] 51
## [1] 52
## [1] 1572
## [1] "data"
## [1] "EN"
## [1] 52
## [1] 53
## [1] 1583
## [1] "data"
## [1] "EN"
## [1] 53
## [1] 54
## [1] 1594
## [1] "data"
## [1] "EN"
## [1] 54
## [1] 55
## [1] 1605
## [1] "data"
## [1] "en train ups"
## [1] "EN"
## [1] 54
## [1] 56
## [1] 1616
## [1] "data"
## [1] "EN"
## [1] 55
## [1] 57
## [1] 1627
## [1] "data"
## [1] "EN"
## [1] 56
## [1] 58
## [1] 1638
## [1] "data"
## [1] "EN"
## [1] 57
## [1] 59
## [1] 1649
## [1] "data"
## [1] "EN"
## [1] 58
## [1] 60
## [1] 1660
## [1] "data"
## [1] "EN"
## [1] 59
## [1] 61
## [1] 1671
## [1] "data"
## [1] "EN"
## [1] 60
## [1] 62
## [1] 1682
## [1] "data"
## [1] "EN"
## [1] 61
## [1] 63
## [1] 1693
## [1] "data"
## [1] "EN"
## [1] 62
## [1] 64
## [1] 1704
## [1] "data"
## [1] "EN"
## [1] 63
## [1] 65
## [1] 1715
## [1] "data"
## [1] "EN"
## [1] 64
## [1] 66
## [1] 1726
## [1] "data"
## [1] "EN"
## [1] 65
## [1] 67
## [1] 1737
## [1] "data"
## [1] "EN"
## [1] 66
## [1] 68
## [1] 1748
## [1] "data"
## [1] "EN"
## [1] 67
## [1] 69
## [1] 1759
## [1] "data"
## [1] "EN"
## [1] 68
## [1] 70
## [1] 1770
## [1] "data"
## [1] "en train ups"
## [1] "en pred ups"
## [1] "EN"
## [1] 68
## [1] 71
## [1] 1781
## [1] "data"
## [1] "EN"
## [1] 69
## [1] 72
## [1] 1792
## [1] "data"
## [1] "EN"
## [1] 70
## [1] 73
## [1] 1803
## [1] "data"
## [1] "EN"
## [1] 71
## [1] 74
## [1] 1814
## [1] "data"
## [1] "EN"
## [1] 72
## [1] 75
## [1] 1825
## [1] "data"
## [1] "EN"
## [1] 73
## [1] 76
## [1] 1836
## [1] "data"
## [1] "EN"
## [1] 74
## [1] 77
## [1] 1847
## [1] "data"
## [1] "EN"
## [1] 75
## [1] 78
## [1] 1858
## [1] "data"
## [1] "EN"
## [1] 76
## [1] 79
## [1] 1869
## [1] "data"
## [1] "EN"
## [1] 77
## [1] 80
## [1] 1880
## [1] "data"
## [1] "EN"
## [1] 78
## [1] 81
## [1] 1891
## [1] "data"
## [1] "EN"
## [1] 79
## [1] 82
## [1] 1902
## [1] "data"
## [1] "EN"
## [1] 80
## [1] 83
## [1] 1913
## [1] "data"
## [1] "EN"
## [1] 81
## [1] 84
## [1] 1924
## [1] "data"
## [1] "EN"
## [1] 82
## [1] 85
## [1] 1935
## [1] "data"
## [1] "EN"
## [1] 83
## [1] 86
## [1] 1946
## [1] "data"
## [1] "EN"
## [1] 84
## [1] 87
## [1] 1957
## [1] "data"
## [1] "EN"
## [1] 85
## [1] 88
## [1] 1968
## [1] "data"
## [1] "EN"
## [1] 86
## [1] 89
## [1] 1979
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 86
## [1] 90
## [1] 1990
## [1] "data"
## [1] "EN"
## [1] 87
## [1] 91
## [1] 2001
## [1] "data"
## [1] "EN"
## [1] 88
## [1] 92
## [1] 2012
## [1] "data"
## [1] "EN"
## [1] 89
## [1] 93
## [1] 2023
## [1] "data"
## [1] "EN"
## [1] 90
## [1] 94
## [1] 2034
## [1] "data"
## [1] "EN"
## [1] 91
## [1] 95
## [1] 2045
## [1] "data"
## [1] "EN"
## [1] 92
## [1] 96
## [1] 2056
## [1] "data"
## [1] "EN"
## [1] 93
## [1] 97
## [1] 2067
## [1] "data"
## [1] "EN"
## [1] 94
## [1] 98
## [1] 2078
## [1] "data"
## [1] "EN"
## [1] 95
## [1] 99
## [1] 2089
## [1] "data"
## [1] "EN"
## [1] 96
## [1] 100
## [1] 2100
## [1] "data"
## [1] "EN"
## [1] 97
## [1] 101
## [1] 2111
## [1] "data"
## [1] "EN"
## [1] 98
## [1] 102
## [1] 2122
## [1] "data"
## [1] "en train ups"
## [1] "en pred ups"
## [1] "EN"
## [1] 98
## [1] 103
## [1] 2133
## [1] "data"
## [1] "EN"
## [1] 99
## [1] 104
## [1] 2144
## [1] "data"
## [1] "en train ups"
## [1] "EN"
## [1] 99
## [1] 105
## [1] 2155
## [1] "data"
## [1] "EN"
## [1] 1
## [1] 0
## [1] 2166
## [1] "data"
## [1] "iTwiner"
## [1] 2
## [1] 1
## [1] 2177
## [1] "data"
## [1] "iTwiner"
## [1] 3
## [1] 2
## [1] 2188
## [1] "data"
## [1] "iTwiner"
## [1] 4
## [1] 3
## [1] 2199
## [1] "data"
## [1] "iTwiner"
## [1] 5
## [1] 4
## [1] 2210
## [1] "data"
## [1] "iTwiner"
## [1] 6
## [1] 5
## [1] 2221
## [1] "data"
## [1] "iTwiner"
## [1] 7
## [1] 6
## [1] 2232
## [1] "data"
## [1] "iTwiner"
## [1] 8
## [1] 7
## [1] 2243
## [1] "data"
## [1] "iTwiner"
## [1] 9
## [1] 8
## [1] 2254
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 10
## [1] 8
## [1] 2265
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 11
## [1] 8
## [1] 2276
## [1] "data"
## [1] "iTwiner"
## [1] 12
## [1] 9
## [1] 2287
## [1] "data"
## [1] "iTwiner"
## [1] 13
## [1] 10
## [1] 2298
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 14
## [1] 10
## [1] 2309
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 15
## [1] 10
## [1] 2320
## [1] "data"
## [1] "iTwiner"
## [1] 16
## [1] 11
## [1] 2331
## [1] "data"
## [1] "iTwiner"
## [1] 17
## [1] 12
## [1] 2342
## [1] "data"
## [1] "iTwiner"
## [1] 18
## [1] 13
## [1] 2353
## [1] "data"
## [1] "iTwiner"
## [1] 19
## [1] 14
## [1] 2364
## [1] "data"
## [1] "iTwiner"
## [1] 20
## [1] 15
## [1] 2375
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 21
## [1] 15
## [1] 2386
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 22
## [1] 15
## [1] 2397
## [1] "data"
## [1] "iTwiner"
## [1] 23
## [1] 16
## [1] 2408
## [1] "data"
## [1] "iTwiner"
## [1] 24
## [1] 17
## [1] 2419
## [1] "data"
## [1] "iTwiner"
## [1] 25
## [1] 18
## [1] 2430
## [1] "data"
## [1] "iTwiner"
## [1] 26
## [1] 19
## [1] 2441
## [1] "data"
## [1] "iTwiner"
## [1] 27
## [1] 20
## [1] 2452
## [1] "data"
## [1] "iTwiner"
## [1] 28
## [1] 21
## [1] 2463
## [1] "data"
## [1] "iTwiner"
## [1] 29
## [1] 22
## [1] 2474
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 30
## [1] 23
## [1] 2485
## [1] "data"
## [1] "iTwiner"
## [1] 31
## [1] 24
## [1] 2496
## [1] "data"
## [1] "iTwiner"
## [1] 32
## [1] 25
## [1] 2507
## [1] "data"
## [1] "iTwiner"
## [1] 33
## [1] 26
## [1] 2518
## [1] "data"
## [1] "iTwiner"
## [1] 34
## [1] 27
## [1] 2529
## [1] "data"
## [1] "iTwiner"
## [1] 35
## [1] 28
## [1] 2540
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 36
## [1] 28
## [1] 2551
## [1] "data"
## [1] "iTwiner"
## [1] 37
## [1] 29
## [1] 2562
## [1] "data"
## [1] "iTwiner"
## [1] 38
## [1] 30
## [1] 2573
## [1] "data"
## [1] "iTwiner"
## [1] 39
## [1] 31
## [1] 2584
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 40
## [1] 32
## [1] 2595
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 41
## [1] 32
## [1] 2606
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 42
## [1] 32
## [1] 2617
## [1] "data"
## [1] "iTwiner"
## [1] 43
## [1] 33
## [1] 2628
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 44
## [1] 33
## [1] 2639
## [1] "data"
## [1] "iTwiner"
## [1] 45
## [1] 34
## [1] 2650
## [1] "data"
## [1] "iTwiner"
## [1] 46
## [1] 35
## [1] 2661
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 47
## [1] 35
## [1] 2672
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 48
## [1] 36
## [1] 2683
## [1] "data"
## [1] "iTwiner"
## [1] 49
## [1] 37
## [1] 2694
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 50
## [1] 37
## [1] 2705
## [1] "data"
## [1] "iTwiner"
## [1] 51
## [1] 38
## [1] 2716
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 52
## [1] 39
## [1] 2727
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 53
## [1] 40
## [1] 2738
## [1] "data"
## [1] "iTwiner"
## [1] 54
## [1] 41
## [1] 2749
## [1] "data"
## [1] "iTwiner"
## [1] 55
## [1] 42
## [1] 2760
## [1] "data"
## [1] "iTwiner"
## [1] 56
## [1] 43
## [1] 2771
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 57
## [1] 44
## [1] 2782
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 58
## [1] 45
## [1] 2793
## [1] "data"
## [1] "iTwiner"
## [1] 59
## [1] 46
## [1] 2804
## [1] "data"
## [1] "iTwiner"
## [1] 60
## [1] 47
## [1] 2815
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 61
## [1] 47
## [1] 2826
## [1] "data"
## [1] "iTwiner"
## [1] 62
## [1] 48
## [1] 2837
## [1] "data"
## [1] "iTwiner"
## [1] 63
## [1] 49
## [1] 2848
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 64
## [1] 50
## [1] 2859
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 65
## [1] 50
## [1] 2870
## [1] "data"
## [1] "iTwiner"
## [1] 66
## [1] 51
## [1] 2881
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 67
## [1] 51
## [1] 2892
## [1] "data"
## [1] "iTwiner"
## [1] 68
## [1] 52
## [1] 2903
## [1] "data"
## [1] "iTwiner"
## [1] 69
## [1] 53
## [1] 2914
## [1] "data"
## [1] "iTwiner"
## [1] 70
## [1] 54
## [1] 2925
## [1] "data"
## [1] "iTwiner"
## [1] 71
## [1] 55
## [1] 2936
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 72
## [1] 55
## [1] 2947
## [1] "data"
## [1] "iTwiner"
## [1] 73
## [1] 56
## [1] 2958
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 74
## [1] 56
## [1] 2969
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 75
## [1] 56
## [1] 2980
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 76
## [1] 56
## [1] 2991
## [1] "data"
## [1] "iTwiner"
## [1] 77
## [1] 57
## [1] 3002
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 78
## [1] 57
## [1] 3013
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 79
## [1] 58
## [1] 3024
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 80
## [1] 58
## [1] 3035
## [1] "data"
## [1] "iTwiner"
## [1] 81
## [1] 59
## [1] 3046
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 82
## [1] 59
## [1] 3057
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 83
## [1] 59
## [1] 3068
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 84
## [1] 59
## [1] 3079
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 85
## [1] 59
## [1] 3090
## [1] "data"
## [1] "iTwiner não correu"
## [1] 86
## [1] 59
## [1] 3101
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 87
## [1] 59
## [1] 3112
## [1] "data"
## [1] "iTwiner"
## [1] 88
## [1] 60
## [1] 3123
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 89
## [1] 60
## [1] 3134
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 90
## [1] 60
## [1] 3145
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 91
## [1] 60
## [1] 3156
## [1] "data"
## [1] "iTwiner"
## [1] 92
## [1] 61
## [1] 3167
## [1] "data"
## [1] "iTwiner"
## [1] 93
## [1] 62
## [1] 3178
## [1] "data"
## [1] "iTwiner"
## [1] 94
## [1] 63
## [1] 3189
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 95
## [1] 63
## [1] 3200
## [1] "data"
## [1] "iTwiner"
## [1] 96
## [1] 64
## [1] 3211
## [1] "data"
## [1] "iTwiner"
## [1] 97
## [1] 65
## [1] 3222
## [1] "data"
## [1] "iTwiner"
## [1] 98
## [1] 66
## [1] 3233
## [1] "data"
## [1] "iTwiner"
## [1] 99
## [1] 67
## [1] 3244
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 100
## [1] 67
## [1] 3255
## [1] "data"
## [1] "iTwiner"
## [1] 101
## [1] 68
## [1] 3266
## [1] "data"
## [1] "iTwiner"
## [1] 102
## [1] 69
## [1] 3277
## [1] "data"
## [1] "iTwiner"
## [1] 103
## [1] 70
## [1] 3288
## [1] "data"
## [1] "iTwiner"
## [1] 104
## [1] 71
## [1] 3299
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 105
## [1] 72
## [1] 3310
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 106
## [1] 73
## [1] 3321
## [1] "data"
## [1] "iTwiner"
## [1] 107
## [1] 74
## [1] 3332
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 108
## [1] 74
## [1] 3343
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 109
## [1] 74
## [1] 3354
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 110
## [1] 74
## [1] 3365
## [1] "data"
## [1] "iTwiner"
## [1] 111
## [1] 75
## [1] 3376
## [1] "data"
## [1] "iTwiner"
## [1] 112
## [1] 76
## [1] 3387
## [1] "data"
## [1] "iTwiner"
## [1] 113
## [1] 77
## [1] 3398
## [1] "data"
## [1] "iTwiner"
## [1] 114
## [1] 78
## [1] 3409
## [1] "data"
## [1] "iTwiner"
## [1] 115
## [1] 79
## [1] 3420
## [1] "data"
## [1] "iTwiner"
## [1] 116
## [1] 80
## [1] 3431
## [1] "data"
## [1] "iTwiner"
## [1] 117
## [1] 81
## [1] 3442
## [1] "data"
## [1] "iTwiner"
## [1] 118
## [1] 82
## [1] 3453
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 119
## [1] 82
## [1] 3464
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 120
## [1] 82
## [1] 3475
## [1] "data"
## [1] "iTwiner"
## [1] 121
## [1] 83
## [1] 3486
## [1] "data"
## [1] "iTwiner"
## [1] 122
## [1] 84
## [1] 3497
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 123
## [1] 84
## [1] 3508
## [1] "data"
## [1] "iTwiner"
## [1] 124
## [1] 85
## [1] 3519
## [1] "data"
## [1] "iTwiner"
## [1] 125
## [1] 86
## [1] 3530
## [1] "data"
## [1] "iTwiner"
## [1] 126
## [1] 87
## [1] 3541
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 127
## [1] 87
## [1] 3552
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 128
## [1] 87
## [1] 3563
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 129
## [1] 87
## [1] 3574
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 130
## [1] 87
## [1] 3585
## [1] "data"
## [1] "iTwiner"
## [1] 131
## [1] 88
## [1] 3596
## [1] "data"
## [1] "iTwiner"
## [1] 132
## [1] 89
## [1] 3607
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 133
## [1] 90
## [1] 3618
## [1] "data"
## [1] "iTwiner"
## [1] 134
## [1] 91
## [1] 3629
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 135
## [1] 91
## [1] 3640
## [1] "data"
## [1] "iTwiner"
## [1] 136
## [1] 92
## [1] 3651
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 137
## [1] 92
## [1] 3662
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 138
## [1] 92
## [1] 3673
## [1] "data"
## [1] "iTwiner"
## [1] 139
## [1] 93
## [1] 3684
## [1] "data"
## [1] "iTwiner"
## [1] 140
## [1] 94
## [1] 3695
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 141
## [1] 94
## [1] 3706
## [1] "data"
## [1] "iTwiner"
## [1] 142
## [1] 95
## [1] 3717
## [1] "data"
## [1] "iTwiner"
## [1] 143
## [1] 96
## [1] 3728
## [1] "data"
## [1] "iTwiner"
## [1] 144
## [1] 97
## [1] 3739
## [1] "data"
## [1] "iTwiner"
## [1] 145
## [1] 98
## [1] 3750
## [1] "data"
## [1] "iTwiner"
## [1] 146
## [1] 99
## [1] 3761
## [1] "data"
## [1] "iTwiner"
3.2.5.2.1 Measures of model performance
# Indices of the runs to include in the summaries below. Row 1 of the result
# matrices is used for EN and row 2 for iTwiner; a run is kept when its stored
# accuracy is non-zero (presumably 0 is the fill value for runs that did not
# produce a result - TODO confirm).
# NOTE(review): a genuine accuracy of exactly 0 would also be excluded, and
# the train and test masks are computed independently, so the train and test
# summaries may average over different sets of runs - confirm this is intended.
runs_en_tr <- which(acc_cox_tr[1,]!=0)
runs_itw_tr <- which(acc_cox_tr[2,]!=0)

runs_en_ts <- which(acc_cox_tes[1,]!=0)
runs_itw_ts <- which(acc_cox_tes[2,]!=0)
  • Number of variables selected
# Number of variables selected per run: mean / median / sd, restricted to the
# runs flagged by the training masks (runs_en_tr / runs_itw_tr).
# Row 1 of nvar_selected_1 is used for EN, row 2 for iTwiner.
nvar_en <- nvar_selected_1[1,]
nvar_en <- nvar_en[runs_en_tr]
mean(nvar_en)
## [1] 47.93
median(nvar_en)
## [1] 45
sd(nvar_en)
## [1] 21.33914
nvar_tw <- nvar_selected_1[2,]
nvar_tw <- nvar_tw[runs_itw_tr]
mean(nvar_tw)
## [1] 45.26
median(nvar_tw)
## [1] 42
sd(nvar_tw)
## [1] 21.10992
  • Accuracy
# Accuracy of the regularized models: mean / median / sd over the runs kept by
# the corresponding mask. Training summaries use the *_tr masks and test
# summaries use the *_ts masks, so the two may cover different runs.
# EN train
acc_cox_tr_EN <- acc_cox_tr[1,]
acc_cox_tr_EN <- acc_cox_tr_EN[runs_en_tr]
#hist(acc_cox_tr_EN)
mean(acc_cox_tr_EN)  
## [1] 0.8394737
median(acc_cox_tr_EN)
## [1] 0.8421053
sd(acc_cox_tr_EN)
## [1] 0.05931749
# EN test
acc_cox_tes_EN <- acc_cox_tes[1,]
acc_cox_tes_EN <- acc_cox_tes_EN[runs_en_ts]
#hist(acc_cox_tes_EN)
mean(acc_cox_tes_EN)
## [1] 0.5865052
median(acc_cox_tes_EN)
## [1] 0.5882353
sd(acc_cox_tes_EN)
## [1] 0.1018705
#iTwiner train
acc_cox_tr_iTwiner <- acc_cox_tr[2,]
acc_cox_tr_iTwiner <- acc_cox_tr_iTwiner[runs_itw_tr]
#hist(acc_cox_tr_iTwiner)
mean(acc_cox_tr_iTwiner)
## [1] 0.7313158
median(acc_cox_tr_iTwiner)
## [1] 0.7368421
sd(acc_cox_tr_iTwiner)
## [1] 0.09293091
#iTwiner test
acc_cox_tes_iTwiner <- acc_cox_tes[2,]
acc_cox_tes_iTwiner <- acc_cox_tes_iTwiner[runs_itw_ts]
#hist(acc_cox_tes_iTwiner)
mean(acc_cox_tes_iTwiner)
## [1] 0.6388125
median(acc_cox_tes_iTwiner)
## [1] 0.6470588
sd(acc_cox_tes_iTwiner)
## [1] 0.05565765
  • Misclassification
# Misclassification summaries (mean, median, sd) for EN (row 1) and iTwiner
# (row 2). Train values come from miscl_cox_tr restricted to runs_*_tr; test
# values come from miscl_cox_tes restricted to runs_*_ts.
# The values look like per-run counts of misclassified samples - TODO confirm.
# Lines starting with "## " are the recorded output of the preceding call.
# EN, training set
miscl_cox_tr_EN <- miscl_cox_tr[1,]
miscl_cox_tr_EN <- miscl_cox_tr_EN[runs_en_tr]
mean(miscl_cox_tr_EN)  
## [1] 6.1
median(miscl_cox_tr_EN)
## [1] 6
sd(miscl_cox_tr_EN)
## [1] 2.254065
# EN, test set
miscl_cox_tes_EN <- miscl_cox_tes[1,]
miscl_cox_tes_EN <- miscl_cox_tes_EN[runs_en_ts]
mean(miscl_cox_tes_EN)
## [1] 7.029412
median(miscl_cox_tes_EN)
## [1] 7
sd(miscl_cox_tes_EN)
## [1] 1.731799
# iTwiner, training set
miscl_cox_tr_iTwiner <- miscl_cox_tr[2,]
miscl_cox_tr_iTwiner <- miscl_cox_tr_iTwiner[runs_itw_tr]
mean(miscl_cox_tr_iTwiner)
## [1] 10.21
median(miscl_cox_tr_iTwiner)
## [1] 10
sd(miscl_cox_tr_iTwiner)
## [1] 3.531375
# iTwiner, test set
miscl_cox_tes_iTwiner <- miscl_cox_tes[2,]
miscl_cox_tes_iTwiner <- miscl_cox_tes_iTwiner[runs_itw_ts]
mean(miscl_cox_tes_iTwiner)
## [1] 6.140187
median(miscl_cox_tes_iTwiner)
## [1] 6
sd(miscl_cox_tes_iTwiner)
## [1] 0.94618
  • False Neg
# False-negative summaries (mean, median, sd) for EN (row 1) and iTwiner
# (row 2). Train values come from fneg_reg_tr restricted to runs_*_tr; test
# values come from fneg_reg_ts restricted to runs_*_ts. Row and run are
# selected in a single indexing step here.
# Lines starting with "## " are the recorded output of the preceding call.
# EN: training set, then test set
fneg_reg_en_train <- fneg_reg_tr[1,runs_en_tr]
mean(fneg_reg_en_train)
## [1] 6.1
median(fneg_reg_en_train)
## [1] 6
sd(fneg_reg_en_train)
## [1] 2.254065
fneg_reg_en_test <- fneg_reg_ts[1,runs_en_ts]
mean(fneg_reg_en_test)
## [1] 4.911765
median(fneg_reg_en_test)
## [1] 5
sd(fneg_reg_en_test)
## [1] 1.386662
# iTwiner: training set, then test set
fneg_reg_iTwiner_train <- fneg_reg_tr[2,runs_itw_tr]
mean(fneg_reg_iTwiner_train)
## [1] 10.21
median(fneg_reg_iTwiner_train)
## [1] 10
sd(fneg_reg_iTwiner_train)
## [1] 3.531375
fneg_reg_iTwiner_test <- fneg_reg_ts[2,runs_itw_ts]
mean(fneg_reg_iTwiner_test)
## [1] 6.065421
median(fneg_reg_iTwiner_test)
## [1] 6
sd(fneg_reg_iTwiner_test)
## [1] 0.9035676
  • sensitivity
# Sensitivity summaries (mean, median, sd) for EN (row 1) and iTwiner (row 2).
# Train values come from sensitivity_cox_tr restricted to runs_*_tr; test
# values come from sensitivity_cox_tes restricted to runs_*_ts.
# Lines starting with "## " are the recorded output of the preceding call.
# EN, training set
sensitivity_cox_tr_EN <- sensitivity_cox_tr[1,]
sensitivity_cox_tr_EN <- sensitivity_cox_tr_EN[runs_en_tr]
mean(sensitivity_cox_tr_EN)  
## [1] 0.6411765
median(sensitivity_cox_tr_EN)
## [1] 0.6470588
sd(sensitivity_cox_tr_EN)
## [1] 0.132592
# EN, test set
sensitivity_cox_tes_EN <- sensitivity_cox_tes[1,]
sensitivity_cox_tes_EN <- sensitivity_cox_tes_EN[runs_en_ts]
mean(sensitivity_cox_tes_EN)
## [1] 0.3860294
median(sensitivity_cox_tes_EN)
## [1] 0.375
sd(sensitivity_cox_tes_EN)
## [1] 0.1733327
# iTwiner, training set
sensitivity_cox_tr_iTwiner <- sensitivity_cox_tr[2,]
sensitivity_cox_tr_iTwiner <- sensitivity_cox_tr_iTwiner[runs_itw_tr]
mean(sensitivity_cox_tr_iTwiner)
## [1] 0.3994118
median(sensitivity_cox_tr_iTwiner)
## [1] 0.4117647
sd(sensitivity_cox_tr_iTwiner)
## [1] 0.2077279
# iTwiner, test set
sensitivity_cox_tes_iTwiner <- sensitivity_cox_tes[2,]
sensitivity_cox_tes_iTwiner <- sensitivity_cox_tes_iTwiner[runs_itw_ts]
mean(sensitivity_cox_tes_iTwiner)
## [1] 0.2418224
median(sensitivity_cox_tes_iTwiner)
## [1] 0.25
sd(sensitivity_cox_tes_iTwiner)
## [1] 0.112946
  • specificity
# Specificity summaries (mean, median, sd) for EN (row 1) and iTwiner (row 2).
# Train values come from specificity_cox_tr restricted to runs_*_tr; test
# values come from specificity_cox_tes restricted to runs_*_ts.
# Lines starting with "## " are the recorded output of the preceding call.
# EN, training set
specificity_cox_tr_EN <- specificity_cox_tr[1,]
specificity_cox_tr_EN <- specificity_cox_tr_EN[runs_en_tr]
mean(specificity_cox_tr_EN)  
## [1] 1
median(specificity_cox_tr_EN)
## [1] 1
sd(specificity_cox_tr_EN)
## [1] 0
# EN, test set
specificity_cox_tes_EN <- specificity_cox_tes[1,]
specificity_cox_tes_EN <- specificity_cox_tes_EN[runs_en_ts]
mean(specificity_cox_tes_EN)
## [1] 0.7647059
median(specificity_cox_tes_EN)
## [1] 0.7777778
sd(specificity_cox_tes_EN)
## [1] 0.1664238
# iTwiner, training set
specificity_cox_tr_iTwiner <- specificity_cox_tr[2,]
specificity_cox_tr_iTwiner <- specificity_cox_tr_iTwiner[runs_itw_tr]
mean(specificity_cox_tr_iTwiner)
## [1] 1
median(specificity_cox_tr_iTwiner)
## [1] 1
sd(specificity_cox_tr_iTwiner)
## [1] 0
# iTwiner, test set
specificity_cox_tes_iTwiner <- specificity_cox_tes[2,]
specificity_cox_tes_iTwiner <- specificity_cox_tes_iTwiner[runs_itw_ts]
mean(specificity_cox_tes_iTwiner)
## [1] 0.9916926
median(specificity_cox_tes_iTwiner)
## [1] 1
sd(specificity_cox_tes_iTwiner)
## [1] 0.0395082
  • auc
# AUC summaries (mean, median, sd) for EN (row 1) and iTwiner (row 2).
# Train values come from auc_cox_tr restricted to runs_*_tr; test values come
# from auc_cox_tes restricted to runs_*_ts.
# Lines starting with "## " are the recorded output of the preceding call.
# EN, training set
auc_cox_tr_EN <- auc_cox_tr[1,]
auc_cox_tr_EN <- auc_cox_tr_EN[runs_en_tr]
mean(auc_cox_tr_EN)  
## [1] 0.8205882
median(auc_cox_tr_EN)
## [1] 0.8235294
sd(auc_cox_tr_EN)
## [1] 0.06629602
# EN, test set
auc_cox_tes_EN <- auc_cox_tes[1,]
auc_cox_tes_EN <- auc_cox_tes_EN[runs_en_ts]
mean(auc_cox_tes_EN)
## [1] 0.5834014
median(auc_cox_tes_EN)
## [1] 0.5763889
sd(auc_cox_tes_EN)
## [1] 0.09536522
# iTwiner, training set
auc_cox_tr_iTwiner <- auc_cox_tr[2,]
auc_cox_tr_iTwiner <- auc_cox_tr_iTwiner[runs_itw_tr]
mean(auc_cox_tr_iTwiner)
## [1] 0.6997059
median(auc_cox_tr_iTwiner)
## [1] 0.7058824
sd(auc_cox_tr_iTwiner)
## [1] 0.103864
# iTwiner, test set
auc_cox_tes_iTwiner <- auc_cox_tes[2,]
auc_cox_tes_iTwiner <- auc_cox_tes_iTwiner[runs_itw_ts]
mean(auc_cox_tes_iTwiner)
## [1] 0.6167575
median(auc_cox_tes_iTwiner)
## [1] 0.625
sd(auc_cox_tes_iTwiner)
## [1] 0.05843036
  • Names of genes selected

Variables always selected

# Genes selected in every retained run: the per-run selections are subset to
# the retained runs and intersected pairwise with Reduce(intersect, ...).
# NOTE(review): the runs are filtered with runs_*_ts (non-zero test accuracy),
# whereas the variable-count summaries use runs_*_tr - confirm this is intended.
# Lines starting with "## " are the recorded output of the preceding call.
var_selected_alw_select_en <- var_selected_en1[runs_en_ts]
var_selected_alw_select_en1 <-  Reduce(intersect,var_selected_alw_select_en)
print(paste("variables always selected by EN = ",length(var_selected_alw_select_en1)))
## [1] "variables always selected by EN =  0"
# Same computation for iTwiner
var_selected_alw_select_iTwiner <- var_selected_iTwiner1[runs_itw_ts]
var_selected_alw_select_iTwiner1 <-  Reduce(intersect,var_selected_alw_select_iTwiner)
print(paste("variables always selected by iTwiner = ",length(var_selected_alw_select_iTwiner1)))
## [1] "variables always selected by iTwiner =  2"

Variables selected in more than 50% of the bootstrap samples

# Genes selected in more than half of the retained runs.
# l is the number of retained runs; table(unlist(...)) counts how many of the
# per-run selections contain each gene, and the subset keeps Freq > 0.50 * l.
# Lines starting with "## " are the recorded output of the preceding call.
# EN
l = length(var_selected_alw_select_en)
var_selected_50_select_en <- table(unlist(var_selected_alw_select_en))
var_selected_50_select_en <- as.data.frame(var_selected_50_select_en)
var_selected_50_select_en <-  subset(var_selected_50_select_en, Freq > 0.50*l)
print(paste("variables selected 50% by EN = ",length(var_selected_50_select_en$Var1)))
## [1] "variables selected 50% by EN =  4"
var_selected_50_select_en$Var1
## [1] CD200   MIR602  NCOR1P4 RLN3   
## 1392 Levels: A2M-AS1 AANAT ABITRAMP1 ABR ACP3 ACRBP ACTBP2 ACTBP9 ... ZSCAN12P1
# iTwiner (l is overwritten with the iTwiner run count)
l = length(var_selected_alw_select_iTwiner)
var_selected_50_select_iTwiner <- table(unlist(var_selected_alw_select_iTwiner))
var_selected_50_select_iTwiner <- as.data.frame(var_selected_50_select_iTwiner)
var_selected_50_select_iTwiner <-  subset(var_selected_50_select_iTwiner, Freq > 0.50*l)
print(paste("variables selected 50% by iTwiner = ",length(var_selected_50_select_iTwiner$Var1)))
## [1] "variables selected 50% by iTwiner =  25"
var_selected_50_select_iTwiner$Var1
##  [1] CHMP1AP1  DNTT      EEF1B2P6  GZMAP1    HAUS5-DT  HORMAD2   HSPD1P7  
##  [8] KCTD9P3   LINC02543 LRIT1     MIR3907   MIR564    MIR602    NDUFA5P10
## [15] OR1S2     OR51K1P   PRSS57    RAC1P3    RN7SKP32  RNU6-428P RNU6-552P
## [22] RPL32P17  TPT1P3    TRBV11-1  XRCC6P2  
## 237 Levels: ABCD1P3 ACTN4P1 ALDH7A1P4 ANKRD20A10P API5P1 APOOP4 ... ZNF725P
  • Select the 100 (and 50) most frequently selected variables
# Rank every gene by the number of retained runs that selected it, plot the
# distribution of those counts, and export the most frequently selected genes.
#
# Results left in the workspace:
#   var_selected_50_select_en / _iTwiner : full frequency table (Var1, Freq),
#                                          sorted by decreasing Freq
#   top100_en / top100_itw               : one-column data frame with (up to)
#                                          the 100 most frequent genes
#   top50_en / top50_itw                 : factor with (up to) the 50 most
#                                          frequent genes
# Side effects: two histograms and two .xlsx files in the working directory.
#
# head() is used instead of [1:100, ] / [1:50] so that a model that selected
# fewer genes yields a shorter list rather than one padded with NA entries.

# EN
var_selected_50_select_en <- as.data.frame(
  table(unlist(var_selected_alw_select_en))
)
var_selected_50_select_en <- var_selected_50_select_en[
  order(var_selected_50_select_en$Freq, decreasing = TRUE),
]
hist(var_selected_50_select_en$Freq)

top100_en <- head(var_selected_50_select_en$Var1, 100)
top50_en <- head(top100_en, 50)
top100_en <- as.data.frame(top100_en)
write_xlsx(top100_en, "List_top100_en_d2.xlsx")

# iTwiner
var_selected_50_select_iTwiner <- as.data.frame(
  table(unlist(var_selected_alw_select_iTwiner))
)
var_selected_50_select_iTwiner <- var_selected_50_select_iTwiner[
  order(var_selected_50_select_iTwiner$Freq, decreasing = TRUE),
]
hist(var_selected_50_select_iTwiner$Freq)

top100_itw <- head(var_selected_50_select_iTwiner$Var1, 100)
top50_itw <- head(top100_itw, 50)
top100_itw <- as.data.frame(top100_itw)
write_xlsx(top100_itw, "List_top100_itw_d2.xlsx")

Variables in common between EN and iTwiner

# Genes that appear in both frequency tables, in the iTwiner table's order.
# NOTE: despite the "_50_" in the names, var_selected_50_select_en and
# var_selected_50_select_iTwiner were reassigned in the ranking step above to
# the full, frequency-sorted tables, so this is the overlap of all genes
# selected at least once by each method, not of the >50% or top-50 lists.
# Lines starting with "## " are the recorded output of the preceding call.
common_var_selected_50_en_iTwiner <- var_selected_50_select_iTwiner$Var1[which(var_selected_50_select_iTwiner$Var1 %in% var_selected_50_select_en$Var1)]
length(common_var_selected_50_en_iTwiner)
## [1] 50
common_var_selected_50_en_iTwiner
##  [1] MIR602       RAC1P3       XRCC6P2      MIR3907      KCTD9P3     
##  [6] TRBV11-1     GZMAP1       PRSS57       LRIT1        LINC02543   
## [11] RNU6-428P    HAUS5-DT     DNTT         RN7SKP32     TPT1P3      
## [16] OR1S2        RPL32P17     HORMAD2      EEF1B2P6     MIR564      
## [21] SULT6B2P     MTND1P22     RNU1-18P     TRBJ1-6      SCDP1       
## [26] LINC02868    MIR6508      LINC01100    MAGEB6       RNU6-542P   
## [31] MIR376B      TRAJ40       TRAJ58       RNU7-193P    MIR6792     
## [36] ANKRD20A10P  MIR4301      DBF4P3       MIR509-2     RPL34P27    
## [41] CLUHP5       DEFB4B       KDM4F        LINC01927    CDH12P4     
## [46] HMGB1P50     IGKV1OR2-118 MIR6729      RNA5SP42     RNU4-92P    
## 237 Levels: ABCD1P3 ACTN4P1 ALDH7A1P4 ANKRD20A10P API5P1 APOOP4 ... ZNF725P
  • Variables selected by EN and iTwiner that are DEGs
# Differential-expression test (edgeR) restricted to the genes ranked highest
# by EN and by iTwiner.
#
# Reads:  rnaseq1 (samples in rows, genes in columns), top50_en, top50_itw,
#         DATASET1_bal (columns ID and class).
# Leaves: xdata (genes x samples), ydata, group, class, edgeR.DGElist, keep,
#         design, edger_fit, edger_lrt.
xdata <- rnaseq1[order(row.names(rnaseq1)), ]

# A gene can be in both top-50 lists. Selecting it twice makes the data frame
# rename the copy ("GENE.1"), so the gene is tested twice and the multiple-
# testing correction is computed over an inflated set. Keep each gene once.
nomesgenes <- unique(c(as.vector(top50_en), as.vector(top50_itw)))
nomesgenes <- nomesgenes[!is.na(nomesgenes)]
xdata <- xdata[, nomesgenes, drop = FALSE]

rownames(DATASET1_bal) <- DATASET1_bal$ID
ydata <- as.data.frame(DATASET1_bal[order(row.names(DATASET1_bal)), ])
rownames(ydata) <- ydata$ID

# Keep only the samples present in both tables ...
xdata <- xdata[rownames(xdata) %in% rownames(ydata), , drop = FALSE]
ydata <- as.data.frame(ydata[rownames(ydata) %in% rownames(xdata), ])
# ... and put the labels in exactly the row order of the counts, so that
# `group` below lines up with the columns of the transposed count table.
ydata <- ydata[rownames(xdata), , drop = FALSE]

# keep features with standard deviation > 0
gene_sd <- vapply(
  seq_len(ncol(xdata)),
  function(ix) sd(xdata[, ix]),
  numeric(1)
)
xdata <- xdata[, gene_sd != 0, drop = FALSE]
xdata <- t(xdata)

group <- as.factor(ydata$class)
# Kept as in the original script in case later code reads it; as a variable it
# does not stop class() from being called as a function.
class <- as.data.frame(ydata$class)
edgeR.DGElist <- DGEList(counts = xdata, group = group)

# remove genes that do not have one count per million in at least 5 samples
keep <- rowSums(cpm(edgeR.DGElist) >= 1) >= 5
edgeR.DGElist <- edgeR.DGElist[keep, ]

# specify the design setup
design <- model.matrix(~group)

# estimate the dispersion for all read counts across all samples
edgeR.DGElist <- estimateDisp(edgeR.DGElist, design)

# fit the negative binomial model
edger_fit <- glmFit(edgeR.DGElist, design)

# perform the testing for every gene using the neg. binomial model;
# with no `coef` given, glmLRT tests the last column of the design matrix
edger_lrt <- glmLRT(edger_fit)
summary(decideTests(edger_lrt))
##        groupPm
## Down         2
## NotSig      95
## Up           3
# Extract the complete results table from edger_lrt, sorted by p-value, with
# Benjamini-Hochberg adjusted p-values (the FDR column in the output below)
DGE.results_edgeR <- topTags(edger_lrt, n = Inf , sort.by = "PValue" , adjust.method = "BH" )

topTags(DGE.results_edgeR) # table with the top 10 DEGs
## Coefficient:  groupPm 
##               logFC   logCPM        LR       PValue         FDR
## RN7SKP32   2.417523 10.98057 18.865397 1.402743e-05 0.001402743
## TPT1P3     2.034975 10.93476 13.937757 1.889644e-04 0.009448219
## XRCC6P2   -2.524191 11.11118 11.000408 9.109182e-04 0.022772954
## XRCC6P2.1 -2.524191 11.11118 11.000408 9.109182e-04 0.022772954
## MIR6508    1.870106 10.99754 10.186838 1.414466e-03 0.028289327
## LINC01100 -2.410049 11.08279  7.675900 5.596316e-03 0.089843327
## TMPRSS11F -2.376134 11.07022  7.277603 6.981966e-03 0.089843327
## OR51K1P   -2.170233 11.04336  7.225512 7.187466e-03 0.089843327
## KIR2DL4   -1.338876 12.93927  6.650194 9.914503e-03 0.094843330
## MIR7854    1.225777 15.14133  6.396702 1.143326e-02 0.094843330
# Keep the genes whose adjusted p-value (FDR) is below 0.05, store the gene
# name in a `row` column (row names are not written to the spreadsheet by
# default) and export the table to an .xlsx file in the working directory.
# Lines starting with "## " are the recorded output of the preceding call.
genes_deg <- DGE.results_edgeR$table
genes_deg <- genes_deg[which(genes_deg$FDR < 0.05),]
dim(genes_deg)# rows = genes found to be differentially expressed
## [1] 5 5
genes_deg$row <- row.names(genes_deg)
write_xlsx(genes_deg,"genes_deg_enitw_d2.xlsx")

# Genes with a positive log fold change for the tested coefficient (groupPm),
# i.e. higher in Pm than in the reference level (presumably "P" - confirm)
high <- genes_deg[genes_deg$logFC > 0,]
dim(high)
## [1] 3 6
# Genes with a negative log fold change for the tested coefficient
low <- genes_deg[genes_deg$logFC < 0,]
dim(low)
## [1] 2 6

3.2.5.3 3) Classification based on genes selected by regularized logistic regression

3.2.5.3.1 EN + Classifiers
# Build the inputs of the "EN + classifiers" experiment.
#
# Reads:  xdataT (samples in rows, genes in columns), clinic1$class,
#         DATASET1$ID, top50_en.
# Leaves: xdata, ydata.raw, ydata (columns class, id), names, nomesgenes
#         (original gene names of the selected columns) and xdata_en
#         (selected genes renamed Var1..VarK plus a factor column `type`).
xdata <- xdataT

ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

#ydata.raw$class <- c(rep(0,28),rep(1,34))

# Recode the class labels in place: "P" -> 0, "Pm" -> 1. The column is
# converted with as.numeric() at the end of this block.
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# Keep the samples present in both tables, then sort both by sample id so
# that rows of xdata and ydata.raw correspond.
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(
  ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ]
)

xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]

ydata <- as.data.frame(ydata.raw[, 1:2])
colnames(ydata) <- c("class", "id")

# Genes ranked highest by EN. The variable name `names` is kept because code
# outside this block may read it. Missing entries (possible when fewer genes
# than requested were ever selected) are dropped so the column lookup below
# cannot fail on NA.
names <- as.vector(top50_en)
names <- names[!is.na(names)]
# drop = FALSE keeps a one-gene selection as a one-column table
xdata_en <- as.data.frame(xdata[, names, drop = FALSE])

# Rename the predictors Var1..VarK, where K is the number of selected genes
# (50 when the full top-50 list is available), instead of assuming 50.
nomesgenes <- colnames(xdata_en)
colnames(xdata_en) <- paste0("Var", seq_along(nomesgenes))
xdata_en$type <- as.factor(ydata$class)
ydata$class <- as.numeric(ydata$class)
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.2.5.3.1.1 Measures of model performance train
  • Accuracy
# Accuracy on the training set, EN + classifiers.
# Rows 1-5 of acc_enplus_train are read into acc_trees, acc_svm, acc_svmR,
# acc_logs and acc_rf (classifier meaning inferred from the names only:
# presumably tree, SVM, a second SVM variant, logistic regression and random
# forest - confirm). Mean, median and sd are taken over each row.
# Lines starting with "## " are the recorded output of the preceding call.
acc_trees <- acc_enplus_train[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.9939474
median(acc_trees)
## [1] 1
sd(acc_trees)
## [1] 0.01113033
acc_svm <- acc_enplus_train[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.9321053
median(acc_svm)
## [1] 0.9210526
sd(acc_svm)
## [1] 0.02596254
acc_svmR <- acc_enplus_train[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.8781579
median(acc_svmR)
## [1] 0.9210526
sd(acc_svmR)
## [1] 0.07689654
acc_logs <- acc_enplus_train[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 1
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0
acc_rf <- acc_enplus_train[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 1
median(acc_rf)
## [1] 1
sd(acc_rf)
## [1] 0
  • auc
# AUC on the training set, EN + classifiers.
# Rows 1-5 of auc_enplus_train are read into auc_trees, auc_svm, auc_svmR,
# auc_logs and auc_rf; mean, median and sd are taken over each row.
# Lines starting with "## " are the recorded output of the preceding call.
auc_trees <- auc_enplus_train[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.9941317
median(auc_trees)
## [1] 1
sd(auc_trees)
## [1] 0.01086274
auc_svm <- auc_enplus_train[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.927591
median(auc_svm)
## [1] 0.9173669
sd(auc_svm)
## [1] 0.02755802
auc_svmR <- auc_enplus_train[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.8792297
median(auc_svmR)
## [1] 0.9117647
sd(auc_svmR)
## [1] 0.06996116
auc_logs <- auc_enplus_train[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 1
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0
auc_rf <- auc_enplus_train[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 1
median(auc_rf)
## [1] 1
sd(auc_rf)
## [1] 0
  • Misclassification
# Misclassification on the training set, EN + classifiers.
# Rows 1-5 of miscl_enplus_train are read into miscl_trees, miscl_svm,
# miscl_svmR, miscl_logs and miscl_rf; mean, median and sd are taken over
# each row. The values look like per-run counts - TODO confirm.
# Lines starting with "## " are the recorded output of the preceding call.
miscl_trees <- miscl_enplus_train[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 0.23
median(miscl_trees)
## [1] 0
sd(miscl_trees)
## [1] 0.4229526
miscl_svm <- miscl_enplus_train[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 2.58
median(miscl_svm)
## [1] 3
sd(miscl_svm)
## [1] 0.9865766
miscl_svmR <- miscl_enplus_train[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 4.63
median(miscl_svmR)
## [1] 3
sd(miscl_svmR)
## [1] 2.922069
miscl_logs <- miscl_enplus_train[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 0
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0
miscl_rf <- miscl_enplus_train[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 0
median(miscl_rf)
## [1] 0
sd(miscl_rf)
## [1] 0
  • Sensitivity
# Sensitivity on the training set, EN + classifiers.
# Rows 1-5 of sensitivity_enplus_train are read into sensitivity_trees,
# sensitivity_svm, sensitivity_svmR, sensitivity_logs and sensitivity_rf;
# mean, median and sd are taken over each row.
# Lines starting with "## " are the recorded output of the preceding call.
sensitivity_trees <- sensitivity_enplus_train[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.9958824
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.01508426
sensitivity_svm <- sensitivity_enplus_train[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.8847059
median(sensitivity_svm)
## [1] 0.8823529
sd(sensitivity_svm)
## [1] 0.04797096
sensitivity_svmR <- sensitivity_enplus_train[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.8894118
median(sensitivity_svmR)
## [1] 0.8823529
sd(sensitivity_svmR)
## [1] 0.0610282
sensitivity_logs <- sensitivity_enplus_train[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 1
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0
sensitivity_rf <- sensitivity_enplus_train[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 1
median(sensitivity_rf)
## [1] 1
sd(sensitivity_rf)
## [1] 0
  • Specificity
# Specificity on the training set, EN + classifiers.
# Rows 1-5 of specificity_enplus_train are read into specificity_trees,
# specificity_svm, specificity_svmR, specificity_logs and specificity_rf;
# mean, median and sd are taken over each row.
# Lines starting with "## " are the recorded output of the preceding call.
specificity_trees <- specificity_enplus_train[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.992381
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.01754538
specificity_svm <- specificity_enplus_train[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.9704762
median(specificity_svm)
## [1] 0.952381
sd(specificity_svm)
## [1] 0.02323008
specificity_svmR <- specificity_enplus_train[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.8690476
median(specificity_svmR)
## [1] 0.952381
sd(specificity_svmR)
## [1] 0.1464789
specificity_logs <- specificity_enplus_train[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 1
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0
specificity_rf <- specificity_enplus_train[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# False negatives on the training set, EN + classifiers.
# Rows 1-5 of fneg_enplus_train are read into fneg_trees, fneg_svm,
# fneg_svmR, fneg_logs and fneg_rf; mean, median and sd are taken over each
# row. The values look like per-run counts - TODO confirm.
# Lines starting with "## " are the recorded output of the preceding call.
fneg_trees <- fneg_enplus_train[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 0.07
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.2564324
fneg_svm <- fneg_enplus_train[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 1.96
median(fneg_svm)
## [1] 2
sd(fneg_svm)
## [1] 0.8155063
fneg_svmR <- fneg_enplus_train[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 1.88
median(fneg_svmR)
## [1] 2
sd(fneg_svmR)
## [1] 1.037479
fneg_logs <- fneg_enplus_train[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 0
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0
fneg_rf <- fneg_enplus_train[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 0
median(fneg_rf)
## [1] 0
sd(fneg_rf)
## [1] 0
3.2.5.3.1.2 Measures of model performance test
  • Accuracy
# Accuracy on the test set, EN + classifiers.
# Rows 1-5 of acc_enplus are read into acc_trees, acc_svm, acc_svmR, acc_logs
# and acc_rf; mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the training-set
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
acc_trees <- acc_enplus[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.6611765
median(acc_trees)
## [1] 0.6470588
sd(acc_trees)
## [1] 0.09277112
acc_svm <- acc_enplus[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.7217647
median(acc_svm)
## [1] 0.7058824
sd(acc_svm)
## [1] 0.0796935
acc_svmR <- acc_enplus[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.7723529
median(acc_svmR)
## [1] 0.7647059
sd(acc_svmR)
## [1] 0.09483351
acc_logs <- acc_enplus[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.7070588
median(acc_logs)
## [1] 0.7058824
sd(acc_logs)
## [1] 0.09234062
acc_rf <- acc_enplus[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.7835294
median(acc_rf)
## [1] 0.7647059
sd(acc_rf)
## [1] 0.07793067
  • auc
# AUC on the test set, EN + classifiers.
# Rows 1-5 of auc_enplus are read into auc_trees, auc_svm, auc_svmR, auc_logs
# and auc_rf; mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the training-set
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
auc_trees <- auc_enplus[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.6629861
median(auc_trees)
## [1] 0.6527778
sd(auc_trees)
## [1] 0.08695739
auc_svm <- auc_enplus[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.7148611
median(auc_svm)
## [1] 0.7048611
sd(auc_svm)
## [1] 0.0820896
auc_svmR <- auc_enplus[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.7766667
median(auc_svmR)
## [1] 0.7708333
sd(auc_svmR)
## [1] 0.0915169
auc_logs <- auc_enplus[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.7030556
median(auc_logs)
## [1] 0.7013889
sd(auc_logs)
## [1] 0.09303933
auc_rf <- auc_enplus[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.7770833
median(auc_rf)
## [1] 0.7638889
sd(auc_rf)
## [1] 0.0787563
  • Misclassification
# Misclassification on the test set, EN + classifiers.
# Rows 1-5 of miscl_enplus are read into miscl_trees, miscl_svm, miscl_svmR,
# miscl_logs and miscl_rf; mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the training-set
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
miscl_trees <- miscl_enplus[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 5.76
median(miscl_trees)
## [1] 6
sd(miscl_trees)
## [1] 1.577109
miscl_svm <- miscl_enplus[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 4.73
median(miscl_svm)
## [1] 5
sd(miscl_svm)
## [1] 1.354789
miscl_svmR <- miscl_enplus[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 3.87
median(miscl_svmR)
## [1] 4
sd(miscl_svmR)
## [1] 1.61217
miscl_logs <- miscl_enplus[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 4.98
median(miscl_logs)
## [1] 5
sd(miscl_logs)
## [1] 1.569791
miscl_rf <- miscl_enplus[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 3.68
median(miscl_rf)
## [1] 4
sd(miscl_rf)
## [1] 1.324821
  • Sensitivity
# Sensitivity on the test set, EN + classifiers.
# Rows 1-5 of sensitivity_enplus are read into sensitivity_trees,
# sensitivity_svm, sensitivity_svmR, sensitivity_logs and sensitivity_rf;
# mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the training-set
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
sensitivity_trees <- sensitivity_enplus[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.64375
median(sensitivity_trees)
## [1] 0.625
sd(sensitivity_trees)
## [1] 0.1602702
sensitivity_svm <- sensitivity_enplus[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.5975
median(sensitivity_svm)
## [1] 0.625
sd(sensitivity_svm)
## [1] 0.1653165
sensitivity_svmR <- sensitivity_enplus[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.85
median(sensitivity_svmR)
## [1] 0.875
sd(sensitivity_svmR)
## [1] 0.1204998
sensitivity_logs <- sensitivity_enplus[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.635
median(sensitivity_logs)
## [1] 0.625
sd(sensitivity_logs)
## [1] 0.1663634
sensitivity_rf <- sensitivity_enplus[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.6675
median(sensitivity_rf)
## [1] 0.75
sd(sensitivity_rf)
## [1] 0.1296119
  • Specificity
# Specificity on the test set, EN + classifiers.
# Rows 1-5 of specificity_enplus are read into specificity_trees,
# specificity_svm, specificity_svmR, specificity_logs and specificity_rf;
# mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the training-set
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
specificity_trees <- specificity_enplus[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.6766667
median(specificity_trees)
## [1] 0.6666667
sd(specificity_trees)
## [1] 0.1679487
specificity_svm <- specificity_enplus[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.8322222
median(specificity_svm)
## [1] 0.8888889
sd(specificity_svm)
## [1] 0.1122223
specificity_svmR <- specificity_enplus[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.7033333
median(specificity_svmR)
## [1] 0.6666667
sd(specificity_svmR)
## [1] 0.1835313
specificity_logs <- specificity_enplus[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.7711111
median(specificity_logs)
## [1] 0.7777778
sd(specificity_logs)
## [1] 0.1463016
specificity_rf <- specificity_enplus[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 0.8866667
median(specificity_rf)
## [1] 0.8888889
sd(specificity_rf)
## [1] 0.1070876
  • False Neg
# False negatives on the test set, EN + classifiers.
# Rows 1-5 of fneg_enplus are read into fneg_trees, fneg_svm, fneg_svmR,
# fneg_logs and fneg_rf; mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the training-set
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
fneg_trees <- fneg_enplus[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 2.85
median(fneg_trees)
## [1] 3
sd(fneg_trees)
## [1] 1.282162
fneg_svm <- fneg_enplus[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 3.22
median(fneg_svm)
## [1] 3
sd(fneg_svm)
## [1] 1.322532
fneg_svmR <- fneg_enplus[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 1.2
median(fneg_svmR)
## [1] 1
sd(fneg_svmR)
## [1] 0.9639984
fneg_logs <- fneg_enplus[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 2.92
median(fneg_logs)
## [1] 3
sd(fneg_logs)
## [1] 1.330907
fneg_rf <- fneg_enplus[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 2.66
median(fneg_rf)
## [1] 2
sd(fneg_rf)
## [1] 1.036895
3.2.5.3.2 iTwiner + classifiers
# Build the inputs of the "iTwiner + classifiers" experiment.
#
# Reads:  xdataT (samples in rows, genes in columns), clinic1$class,
#         DATASET1$ID, top50_itw.
# Leaves: xdata, ydata.raw, ydata (columns class, id), names, nomesgenes
#         (original gene names of the selected columns) and xdata_iTwiner
#         (selected genes renamed Var1..VarK plus a factor column `type`).
xdata <- xdataT

ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

#ydata.raw$class <- c(rep(0,28),rep(1,34))

# Recode the class labels in place: "P" -> 0, "Pm" -> 1. The column is
# converted with as.numeric() at the end of this block.
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# Keep the samples present in both tables, then sort both by sample id so
# that rows of xdata and ydata.raw correspond.
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(
  ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ]
)

xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]

ydata <- as.data.frame(ydata.raw[, 1:2])
colnames(ydata) <- c("class", "id")

# Genes ranked highest by iTwiner. The variable name `names` is kept because
# code outside this block may read it. Missing entries (possible when fewer
# genes than requested were ever selected) are dropped so the column lookup
# below cannot fail on NA.
names <- as.vector(top50_itw)
names <- names[!is.na(names)]
# drop = FALSE keeps a one-gene selection as a one-column table
xdata_iTwiner <- as.data.frame(xdata[, names, drop = FALSE])

# Rename the predictors Var1..VarK, where K is the number of selected genes
# (50 when the full top-50 list is available), instead of assuming 50.
nomesgenes <- colnames(xdata_iTwiner)
colnames(xdata_iTwiner) <- paste0("Var", seq_along(nomesgenes))
xdata_iTwiner$type <- as.factor(ydata$class)
ydata$class <- as.numeric(ydata$class)
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.2.5.3.2.1 Measures of model performance train
  • Accuracy
# Accuracy on the training set, iTwiner + classifiers.
# Rows 1-5 of acc_tcoxplus_train are read into acc_trees, acc_svm, acc_svmR,
# acc_logs and acc_rf; mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the EN + classifiers
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
acc_trees <- acc_tcoxplus_train[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.9755263
median(acc_trees)
## [1] 0.9736842
sd(acc_trees)
## [1] 0.02091094
acc_svm <- acc_tcoxplus_train[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.8694737
median(acc_svm)
## [1] 0.8684211
sd(acc_svm)
## [1] 0.02746558
acc_svmR <- acc_tcoxplus_train[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.8918421
median(acc_svmR)
## [1] 0.8947368
sd(acc_svmR)
## [1] 0.0776425
acc_logs <- acc_tcoxplus_train[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.9957895
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0.00969613
acc_rf <- acc_tcoxplus_train[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.9634211
median(acc_rf)
## [1] 0.9736842
sd(acc_rf)
## [1] 0.02421001
  • auc
# AUC on the training set, iTwiner + classifiers.
# Rows 1-5 of auc_tcoxplus_train are read into auc_trees, auc_svm, auc_svmR,
# auc_logs and auc_rf; mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the EN + classifiers
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
auc_trees <- auc_tcoxplus_train[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.9756723
median(auc_trees)
## [1] 0.9761905
sd(auc_trees)
## [1] 0.02088274
auc_svm <- auc_tcoxplus_train[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.8541176
median(auc_svm)
## [1] 0.8529412
sd(auc_svm)
## [1] 0.03069682
auc_svmR <- auc_tcoxplus_train[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.8822549
median(auc_svmR)
## [1] 0.8823529
sd(auc_svmR)
## [1] 0.08275711
auc_logs <- auc_tcoxplus_train[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.9957983
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0.009740052
auc_rf <- auc_tcoxplus_train[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.9591176
median(auc_rf)
## [1] 0.9705882
sd(auc_rf)
## [1] 0.02705824
  • Misclassification
# Misclassification on the training set, iTwiner + classifiers.
# Rows 1-5 of miscl_tcoxplus_train are read into miscl_trees, miscl_svm,
# miscl_svmR, miscl_logs and miscl_rf; mean, median and sd are taken over
# each row.
# NOTE: these variable names are the ones used for the EN + classifiers
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
miscl_trees <- miscl_tcoxplus_train[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 0.93
median(miscl_trees)
## [1] 1
sd(miscl_trees)
## [1] 0.7946157
miscl_svm <- miscl_tcoxplus_train[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 4.96
median(miscl_svm)
## [1] 5
sd(miscl_svm)
## [1] 1.043692
miscl_svmR <- miscl_tcoxplus_train[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 4.11
median(miscl_svmR)
## [1] 4
sd(miscl_svmR)
## [1] 2.950415
miscl_logs <- miscl_tcoxplus_train[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 0.16
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0.3684529
miscl_rf <- miscl_tcoxplus_train[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 1.39
median(miscl_rf)
## [1] 1
sd(miscl_rf)
## [1] 0.9199802
  • Sensitivity
# Sensitivity on the training set, iTwiner + classifiers.
# Rows 1-5 of sensitivity_tcoxplus_train are read into sensitivity_trees,
# sensitivity_svm, sensitivity_svmR, sensitivity_logs and sensitivity_rf;
# mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the EN + classifiers
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
sensitivity_trees <- sensitivity_tcoxplus_train[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.9770588
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.03116576
sensitivity_svm <- sensitivity_tcoxplus_train[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.7082353
median(sensitivity_svm)
## [1] 0.7058824
sd(sensitivity_svm)
## [1] 0.06139365
sensitivity_svmR <- sensitivity_tcoxplus_train[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.7911765
median(sensitivity_svmR)
## [1] 0.7941176
sd(sensitivity_svmR)
## [1] 0.1465827
sensitivity_logs <- sensitivity_tcoxplus_train[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.9958824
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0.01508426
sensitivity_rf <- sensitivity_tcoxplus_train[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.9182353
median(sensitivity_rf)
## [1] 0.9411765
sd(sensitivity_rf)
## [1] 0.05411648
  • Specificity
# Specificity on the training set, iTwiner + classifiers.
# Rows 1-5 of specificity_tcoxplus_train are read into specificity_trees,
# specificity_svm, specificity_svmR, specificity_logs and specificity_rf;
# mean, median and sd are taken over each row.
# NOTE: these variable names are the ones used for the EN + classifiers
# summaries earlier in the file, so those values are overwritten here.
# Lines starting with "## " are the recorded output of the preceding call.
specificity_trees <- specificity_tcoxplus_train[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.9742857
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.02982653
specificity_svm <- specificity_tcoxplus_train[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 1
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0
specificity_svmR <- specificity_tcoxplus_train[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.9733333
median(specificity_svmR)
## [1] 1
sd(specificity_svmR)
## [1] 0.06779097
specificity_logs <- specificity_tcoxplus_train[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.9957143
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0.01369636
specificity_rf <- specificity_tcoxplus_train[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# False negatives: rows 1-5 of fneg_tcoxplus_train are read into the
# trees, svm, svmR, logs and rf variables respectively; each row is
# summarised by its mean, median and standard deviation.
# NOTE(review): the `_train` suffix suggests training-split values, presumably
# one column per resampling run — confirm where the matrix is filled.
fneg_trees <- fneg_tcoxplus_train[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 0.39
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.529818
fneg_svm <- fneg_tcoxplus_train[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 4.96
median(fneg_svm)
## [1] 5
sd(fneg_svm)
## [1] 1.043692
fneg_svmR <- fneg_tcoxplus_train[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 3.55
median(fneg_svmR)
## [1] 3.5
sd(fneg_svmR)
## [1] 2.491906
fneg_logs <- fneg_tcoxplus_train[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 0.07
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0.2564324
fneg_rf <- fneg_tcoxplus_train[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 1.39
median(fneg_rf)
## [1] 1
sd(fneg_rf)
## [1] 0.9199802
3.2.5.3.2.2 Measures of model performance test
  • Accuracy
# Accuracy: rows 1-5 of acc_tcoxplus are read into the trees, svm, svmR,
# logs and rf variables respectively; each row is summarised by its mean,
# median and standard deviation.
# NOTE(review): this matrix has no `_train` suffix and the section heading
# says "test", so these are presumably held-out values — confirm.
acc_trees <- acc_tcoxplus[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.6764706
median(acc_trees)
## [1] 0.7058824
sd(acc_trees)
## [1] 0.08546861
acc_svm <- acc_tcoxplus[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.7705882
median(acc_svm)
## [1] 0.7647059
sd(acc_svm)
## [1] 0.0894646
acc_svmR <- acc_tcoxplus[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.7241176
median(acc_svmR)
## [1] 0.7058824
sd(acc_svmR)
## [1] 0.0973936
acc_logs <- acc_tcoxplus[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.6611765
median(acc_logs)
## [1] 0.6470588
sd(acc_logs)
## [1] 0.09718703
acc_rf <- acc_tcoxplus[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.8394118
median(acc_rf)
## [1] 0.8235294
sd(acc_rf)
## [1] 0.05783785
  • auc
# AUC: rows 1-5 of auc_tcoxplus are read into the trees, svm, svmR, logs and
# rf variables respectively; each row is summarised by its mean, median and
# standard deviation.
# NOTE(review): this matrix has no `_train` suffix and the section heading
# says "test", so these are presumably held-out values — confirm.
auc_trees <- auc_tcoxplus[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.6742361
median(auc_trees)
## [1] 0.7013889
sd(auc_trees)
## [1] 0.08534957
auc_svm <- auc_tcoxplus[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.7588194
median(auc_svm)
## [1] 0.75
sd(auc_svm)
## [1] 0.09193715
auc_svmR <- auc_tcoxplus[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.7232639
median(auc_svmR)
## [1] 0.7152778
sd(auc_svmR)
## [1] 0.09580586
auc_logs <- auc_tcoxplus[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.6525
median(auc_logs)
## [1] 0.6388889
sd(auc_logs)
## [1] 0.09724186
auc_rf <- auc_tcoxplus[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.8302083
median(auc_rf)
## [1] 0.8125
sd(auc_rf)
## [1] 0.06119949
  • Misclassification
# Misclassifications: rows 1-5 of miscl_tcoxplus are read into the trees,
# svm, svmR, logs and rf variables respectively; each row is summarised by
# its mean, median and standard deviation.
# These assignments reuse variable names (e.g. miscl_svmR) that were filled
# from miscl_tcoxplus_train earlier in the file, overwriting those values.
# NOTE(review): this matrix has no `_train` suffix and the section heading
# says "test", so these are presumably held-out values — confirm.
miscl_trees <- miscl_tcoxplus[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 5.5
median(miscl_trees)
## [1] 5
sd(miscl_trees)
## [1] 1.452966
miscl_svm <- miscl_tcoxplus[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 3.9
median(miscl_svm)
## [1] 4
sd(miscl_svm)
## [1] 1.520898
miscl_svmR <- miscl_tcoxplus[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 4.69
median(miscl_svmR)
## [1] 5
sd(miscl_svmR)
## [1] 1.655691
miscl_logs <- miscl_tcoxplus[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 5.76
median(miscl_logs)
## [1] 6
sd(miscl_logs)
## [1] 1.652179
miscl_rf <- miscl_tcoxplus[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 2.73
median(miscl_rf)
## [1] 3
sd(miscl_rf)
## [1] 0.9832434
  • Sensitivity
# Sensitivity: rows 1-5 of sensitivity_tcoxplus are read into the trees,
# svm, svmR, logs and rf variables respectively; each row is summarised by
# its mean, median and standard deviation.
# These assignments reuse the sensitivity_* variable names that were filled
# from sensitivity_tcoxplus_train earlier in the file, overwriting them.
# NOTE(review): this matrix has no `_train` suffix and the section heading
# says "test", so these are presumably held-out values — confirm.
sensitivity_trees <- sensitivity_tcoxplus[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.62625
median(sensitivity_trees)
## [1] 0.625
sd(sensitivity_trees)
## [1] 0.1680764
sensitivity_svm <- sensitivity_tcoxplus[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.55875
median(sensitivity_svm)
## [1] 0.625
sd(sensitivity_svm)
## [1] 0.1522713
sensitivity_svmR <- sensitivity_tcoxplus[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.70875
median(sensitivity_svmR)
## [1] 0.75
sd(sensitivity_svmR)
## [1] 0.1657694
sensitivity_logs <- sensitivity_tcoxplus[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.495
median(sensitivity_logs)
## [1] 0.5
sd(sensitivity_logs)
## [1] 0.1758098
sensitivity_rf <- sensitivity_tcoxplus[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.67375
median(sensitivity_rf)
## [1] 0.625
sd(sensitivity_rf)
## [1] 0.1254977
  • Specificity
# Specificity: rows 1-5 of specificity_tcoxplus are read into the trees,
# svm, svmR, logs and rf variables respectively; each row is summarised by
# its mean, median and standard deviation.
# These assignments reuse the specificity_* variable names that were filled
# from specificity_tcoxplus_train earlier in the file, overwriting them.
# NOTE(review): this matrix has no `_train` suffix and the section heading
# says "test", so these are presumably held-out values — confirm.
specificity_trees <- specificity_tcoxplus[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.7211111
median(specificity_trees)
## [1] 0.7777778
sd(specificity_trees)
## [1] 0.1425674
specificity_svm <- specificity_tcoxplus[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.9588889
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.08453861
specificity_svmR <- specificity_tcoxplus[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.7377778
median(specificity_svmR)
## [1] 0.7777778
sd(specificity_svmR)
## [1] 0.1878728
specificity_logs <- specificity_tcoxplus[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.8088889
median(specificity_logs)
## [1] 0.8888889
sd(specificity_logs)
## [1] 0.1507184
specificity_rf <- specificity_tcoxplus[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 0.9866667
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0.03957626
  • False Neg
# False negatives: rows 1-5 of fneg_tcoxplus are read into the trees, svm,
# svmR, logs and rf variables respectively; each row is summarised by its
# mean, median and standard deviation.
# These assignments reuse the fneg_* variable names that were filled from
# fneg_tcoxplus_train earlier in the file, overwriting them.
# NOTE(review): this matrix has no `_train` suffix and the section heading
# says "test", so these are presumably held-out values — confirm.
fneg_trees <- fneg_tcoxplus[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 2.99
median(fneg_trees)
## [1] 3
sd(fneg_trees)
## [1] 1.344611
fneg_svm <- fneg_tcoxplus[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 3.53
median(fneg_svm)
## [1] 3
sd(fneg_svm)
## [1] 1.218171
fneg_svmR <- fneg_tcoxplus[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 2.33
median(fneg_svmR)
## [1] 2
sd(fneg_svmR)
## [1] 1.326155
fneg_logs <- fneg_tcoxplus[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 4.04
median(fneg_logs)
## [1] 4
sd(fneg_logs)
## [1] 1.406479
fneg_rf <- fneg_tcoxplus[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 2.61
median(fneg_rf)
## [1] 3
sd(fneg_rf)
## [1] 1.003982
# save results

#save.image("~/results2_2010_final.RData")

3.2.6 Data final

# Long-format accuracy table for the decision tree (row 1 of each accuracy
# matrix) under the three pipelines labelled "DT", "EN + DT" and
# "iTwiner + DT", tagged as DATASET2.
dt2 <- acc[1, ]
dt_en2 <- acc_enplus[1, ]
dt_iTwiner2 <- acc_tcoxplus[1, ]
# dt_hub <- acc_hubplus[1, ]

acc_dt2 <- as.data.frame(c(dt2, dt_en2, dt_iTwiner2
                           # , dt_hub
))
colnames(acc_dt2) <- "acc"

# Label each row from the length of the vector it came from instead of the
# hard-coded ranges 101:200 / 201:300, which were only correct when every
# method contributed exactly 100 values (and errored or mislabelled rows
# otherwise). With 100 values per method the result is unchanged.
# To re-enable the hub variant, add "HUB + DT" and length(dt_hub) below.
acc_dt2$group <- rep(
  c("DT", "EN + DT", "iTwiner + DT"),
  times = c(length(dt2), length(dt_en2), length(dt_iTwiner2))
)
# ordered() accepts the character vector directly, so no intermediate
# character-to-factor conversion is needed.
acc_dt2$group <- ordered(acc_dt2$group, levels = c("DT", "EN + DT", "iTwiner + DT"))

acc_dt2$dataset <- "DATASET2"



# Long-format accuracy table for row 2 of each accuracy matrix under the
# three pipelines labelled "svmL", "EN + svmL" and "iTwiner + svmL", tagged
# as DATASET2.
svmL2 <- acc[2, ]
svmL_en2 <- acc_enplus[2, ]
svmL_iTwiner2 <- acc_tcoxplus[2, ]

acc_svmL2 <- as.data.frame(c(svmL2, svmL_en2, svmL_iTwiner2
                             # , svmL_hub
))
colnames(acc_svmL2) <- "acc"

# Label each row from the length of the vector it came from instead of the
# hard-coded ranges 101:200 / 201:300, which were only correct when every
# method contributed exactly 100 values. With 100 values per method the
# result is unchanged.
acc_svmL2$group <- rep(
  c("svmL", "EN + svmL", "iTwiner + svmL"),
  times = c(length(svmL2), length(svmL_en2), length(svmL_iTwiner2))
)
# ordered() accepts the character vector directly, so no intermediate
# character-to-factor conversion is needed.
acc_svmL2$group <- ordered(acc_svmL2$group, levels = c("svmL", "EN + svmL", "iTwiner + svmL"))

acc_svmL2$dataset <- "DATASET2"




# Long-format accuracy table for row 3 of each accuracy matrix under the
# three pipelines labelled "svmR", "EN + svmR" and "iTwiner + svmR", tagged
# as DATASET2.
svmR2 <- acc[3, ]
svmR_en2 <- acc_enplus[3, ]
svmR_iTwiner2 <- acc_tcoxplus[3, ]

acc_svmR2 <- as.data.frame(c(svmR2, svmR_en2, svmR_iTwiner2
                             # , svmR_hub
))
colnames(acc_svmR2) <- "acc"

# Label each row from the length of the vector it came from instead of the
# hard-coded ranges 101:200 / 201:300, which were only correct when every
# method contributed exactly 100 values. With 100 values per method the
# result is unchanged.
acc_svmR2$group <- rep(
  c("svmR", "EN + svmR", "iTwiner + svmR"),
  times = c(length(svmR2), length(svmR_en2), length(svmR_iTwiner2))
)
# ordered() accepts the character vector directly, so no intermediate
# character-to-factor conversion is needed.
acc_svmR2$group <- ordered(acc_svmR2$group, levels = c("svmR", "EN + svmR", "iTwiner + svmR"))

acc_svmR2$dataset <- "DATASET2"






# Long-format accuracy table for row 4 of each accuracy matrix under the
# three pipelines labelled "logist", "EN + logist" and "iTwiner + logist",
# tagged as DATASET2.
logist2 <- acc[4, ]
logist_en2 <- acc_enplus[4, ]
logist_iTwiner2 <- acc_tcoxplus[4, ]

acc_logist2 <- as.data.frame(c(logist2, logist_en2, logist_iTwiner2
                               # , logist_hub
))
colnames(acc_logist2) <- "acc"

# Label each row from the length of the vector it came from instead of the
# hard-coded ranges 101:200 / 201:300, which were only correct when every
# method contributed exactly 100 values. With 100 values per method the
# result is unchanged.
acc_logist2$group <- rep(
  c("logist", "EN + logist", "iTwiner + logist"),
  times = c(length(logist2), length(logist_en2), length(logist_iTwiner2))
)
# ordered() accepts the character vector directly, so no intermediate
# character-to-factor conversion is needed.
acc_logist2$group <- ordered(acc_logist2$group, levels = c("logist", "EN + logist", "iTwiner + logist"))

acc_logist2$dataset <- "DATASET2"






# Long-format accuracy table for row 5 of each accuracy matrix under the
# three pipelines labelled "rf", "EN + rf" and "iTwiner + rf", tagged as
# DATASET2.
rf2 <- acc[5, ]
rf_en2 <- acc_enplus[5, ]
rf_iTwiner2 <- acc_tcoxplus[5, ]

acc_rf2 <- as.data.frame(c(rf2, rf_en2, rf_iTwiner2
                           # , rf_hub
))
colnames(acc_rf2) <- "acc"

# Label each row from the length of the vector it came from instead of the
# hard-coded ranges 101:200 / 201:300, which were only correct when every
# method contributed exactly 100 values. With 100 values per method the
# result is unchanged.
acc_rf2$group <- rep(
  c("rf", "EN + rf", "iTwiner + rf"),
  times = c(length(rf2), length(rf_en2), length(rf_iTwiner2))
)
# ordered() accepts the character vector directly, so no intermediate
# character-to-factor conversion is needed.
acc_rf2$group <- ordered(acc_rf2$group, levels = c("rf", "EN + rf", "iTwiner + rf"))

acc_rf2$dataset <- "DATASET2"

3.3 DATASET 3

3.3.1 DATA

# Restore the saved DATASET 3 workspace and alias its objects to the `*1`
# names used by the analysis code below.
# NOTE(review): load() restores every object in the .RData file into the
# current environment and silently overwrites same-named objects; the path
# is relative to the user's home directory.
load("~/CRC_LCosta/results/results3_2010_final.RData")
# NOTE(review): the next two assignments look crossed — the `_bal` alias gets
# df3 while the plain alias gets DATASET3_bal. Confirm this is intended.
DATASET1_bal <- df3
DATASET1 <- DATASET3_bal
# NOTE(review): clinic1 and clinic1_bal (below) both receive clinic3_bal, so
# the two names refer to the same table in this section — confirm.
clinic1 <- clinic3_bal
datasurv1 <- datasurv3
clinic1_bal <- clinic3_bal
  • Split the dataset into two groups for classification: – P -> non-metastatic (CLASS = 1) – Pm -> metastatic (CLASS = 0)
# primM <- DATASET1 %>%
#   filter(str_detect(class, "m"))
# rownames(primM) <- primM$ID
# primN <- DATASET1 %>%
#   filter(!str_detect(class, "Pm"))
# rownames(primN) <- primN$ID
# 
# clinic_prim <- DATASET1
# 
# rnaprimM <- rnaseq1[rownames(rnaseq1) %in% 
#                          rownames(primM),]
# 
# rnaprimN <- rnaseq1[rownames(rnaseq1) %in% 
#                          rownames(primN),]
# 
# rnaprim <- rbind(rnaprimM,rnaprimN)
  • removing variables with sd=0
# xmet <- rnaprimM [,sapply(seq(ncol(rnaprimM)), function(ix) {sd(rnaprimM[,ix])}) != 0] 
# xnon <- rnaprimN[,sapply(seq(ncol(rnaprimN)), function(ix) {sd(rnaprimN[,ix])}) != 0] 
# 
# xmet_less <- xmet[,which(colnames(xmet) %in% colnames(xnon))]
# xnon_less <- xnon[,which(colnames(xnon) %in% colnames(xmet))]
# 
# # normalizing data
# xmet_norm <- scale(log2(xmet_less+1)) 
# xnon_norm <- scale(log2(xnon_less+1)) 
# 
# xdataT <- rbind(xmet_less,xnon_less)
# xdataT <- xdataT[ order(row.names(xdataT)), ]
# 
# 
# rm(xmet,xmet_less,xnon,xnon_less,rnaprimM,rnaprimN)
  • weight vector that penalizes genes with greater distances between Pm and P correlation matrices - TWINER
# #xmet_cor <- Matrix(cor(xmet_norm), sparse = TRUE)
# xmet_cor <- cor(xmet_norm)
# #xmet_cor <- as.data.frame(xmet_cor)
# xnon_cor <- cor(xnon_norm)
# #xnon_cor <- as.data.frame(xnon_cor)
# 
# # angular distance
# ang_weight <- vector()
# for (i in 1:dim(xmet_cor)[2]){ 
# ang_weight[i] <- acos(cosine(xmet_cor[,i],xnon_cor[,i]))/pi
# }
# 
# ## normalized weights
# 
# weights <- ang_weight / max(ang_weight)
# hist(weights,main="w")
# 
# 
# pen_weight3 <- 1 / weights
# hist(pen_weight3, main="1 / w")
# 
# rm(xmet_cor,xnon_cor)

3.3.2 Exploratory analysis

  • All dataset x
# Clinical table for all patients, as a plain data frame.
clinical <- as.data.frame(clinic1_bal)


# Quick exploratory overview of a clinical table: structure, frequency
# tables and plots, numeric plots, and a describe() summary.
#
# clinical: data frame to explore.
# Returns the describe() result (auto-printed when called at top level).
basic_eda <- function(clinical) {
  glimpse(clinical)
  # df_Status(clinical)
  freq(clinical)
  # The value of profiling_num() is neither printed nor returned here, so it
  # produces no visible output; wrap it in print() if the table is wanted.
  profiling_num(clinical)
  plot_num(clinical)
  # Describe the argument, not the global `clinic1`: the original call
  # ignored whatever table was passed in and always summarised `clinic1`.
  describe(clinical)
}
basic_eda(clinical)
## Rows: 55
## Columns: 6
## $ class     <chr> "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", …
## $ organ     <chr> "rectum", "colon", "colon", "colon", "rectum", "rectum", "co…
## $ Sex       <chr> "m", "m", "m", "f", "f", "f", "m", "m", "f", "m", "f", "f", …
## $ Age       <dbl> 77, 85, 74, 78, 69, 61, 66, 76, 59, 73, 62, 71, 61, 74, 77, …
## $ Stage     <chr> "II", "II", "III", "II", "III", "II", "III", "II", "III", "I…
## $ sidedness <chr> "rectum", "right", "rectum", NA, "rectum", "rectum", "left",…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1    Pm        30      54.55           54.55
## 2     P        25      45.45          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        47      85.45           85.45
## 2 rectum         8      14.55          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        30      54.55           54.55
## 2   m        25      45.45          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1   III        31      56.36           56.36
## 2    II        24      43.64          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1     right        24      43.64           43.64
## 2      left        17      30.91           74.55
## 3    rectum        10      18.18           92.73
## 4      <NA>         4       7.27          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinic1 
## 
##  6  Variables      55  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct 
##       55        0        2 
##                       
## Value          P    Pm
## Frequency     25    30
## Proportion 0.455 0.545
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       55        0        2 
##                         
## Value       colon rectum
## Frequency      47      8
## Proportion  0.855  0.145
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       55        0        2 
##                       
## Value          f     m
## Frequency     30    25
## Proportion 0.545 0.455
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       48        7       26    0.997     69.5    13.24     53.0     56.7 
##      .25      .50      .75      .90      .95 
##     61.0     71.5     78.0     85.0     85.0 
## 
## lowest : 37 39 53 56 57, highest: 78 81 85 86 87
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       55        0        2 
##                       
## Value         II   III
## Frequency     24    31
## Proportion 0.436 0.564
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       51        4        3 
##                                
## Value        left rectum  right
## Frequency      17     10     24
## Proportion  0.333  0.196  0.471
## --------------------------------------------------------------------------------
# Mean age over the patients with a recorded age (missing ages dropped first).
a <- na.omit(clinical[["Age"]])
mean(a)
## [1] 69.5
  • Only patients that do not metastasize
# Rows of the balanced clinical table whose class does not contain "Pm".
clinical_p <- filter(clinic1_bal, !str_detect(class, "Pm"))

# Redefines basic_eda() for this subset: structure, frequency tables and
# plots, numeric plots, and a describe() summary of the table passed in.
basic_eda <- function(clinical_p) {
  glimpse(clinical_p)
  # df_Status(clinical_p)
  freq(clinical_p)
  profiling_num(clinical_p)
  plot_num(clinical_p)
  describe(clinical_p)
}
basic_eda(clinical_p)
## Rows: 25
## Columns: 6
## $ class     <chr> "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", …
## $ organ     <chr> "rectum", "colon", "colon", "colon", "rectum", "rectum", "co…
## $ Sex       <chr> "m", "m", "m", "f", "f", "f", "m", "m", "f", "m", "f", "f", …
## $ Age       <dbl> 77, 85, 74, 78, 69, 61, 66, 76, 59, 73, 62, 71, 61, 74, 77, …
## $ Stage     <chr> "II", "II", "III", "II", "III", "II", "III", "II", "III", "I…
## $ sidedness <chr> "rectum", "right", "rectum", NA, "rectum", "rectum", "left",…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1     P        25        100             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        22         88              88
## 2 rectum         3         12             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        13         52              52
## 2   m        12         48             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1    II        14         56              56
## 2   III        11         44             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1     right        15         60              60
## 2    rectum         5         20              80
## 3      left         4         16              96
## 4      <NA>         1          4             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinical_p 
## 
##  6  Variables      25  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct    value 
##       25        0        1        P 
##              
## Value       P
## Frequency  25
## Proportion  1
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       25        0        2 
##                         
## Value       colon rectum
## Frequency      22      3
## Proportion   0.88   0.12
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       25        0        2 
##                     
## Value         f    m
## Frequency    13   12
## Proportion 0.52 0.48
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       22        3       16    0.995    70.36    12.78    53.30    59.20 
##      .25      .50      .75      .90      .95 
##    62.50    73.50    77.75    84.60    85.00 
## 
## lowest : 39 53 59 61 62, highest: 76 77 78 81 85
##                                                                             
## Value         39    53    59    61    62    64    66    69    71    73    74
## Frequency      1     1     1     2     1     1     1     1     1     1     2
## Proportion 0.045 0.045 0.045 0.091 0.045 0.045 0.045 0.045 0.045 0.045 0.091
##                                         
## Value         76    77    78    81    85
## Frequency      1     2     2     1     3
## Proportion 0.045 0.091 0.091 0.045 0.136
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       25        0        2 
##                     
## Value        II  III
## Frequency    14   11
## Proportion 0.56 0.44
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       24        1        3 
##                                
## Value        left rectum  right
## Frequency       4      5     15
## Proportion  0.167  0.208  0.625
## --------------------------------------------------------------------------------
# Mean age of the P subset, with missing ages dropped first.
a <- na.omit(clinical_p[["Age"]])
mean(a)
## [1] 70.36364
  • Only patients that do metastasize
# Rows of the balanced clinical table whose class contains "Pm".
clinical_Pm <- filter(clinic1_bal, str_detect(class, "Pm"))

# Redefines basic_eda() for this subset: structure, frequency tables and
# plots, numeric plots, and a describe() summary of the table passed in.
basic_eda <- function(clinical_Pm) {
  glimpse(clinical_Pm)
  # df_Status(clinical_Pm)
  freq(clinical_Pm)
  profiling_num(clinical_Pm)
  plot_num(clinical_Pm)
  describe(clinical_Pm)
}
basic_eda(clinical_Pm)
## Rows: 30
## Columns: 6
## $ class     <chr> "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", "Pm", …
## $ organ     <chr> "rectum", "colon", "rectum", "colon", "colon", "colon", "rec…
## $ Sex       <chr> "f", "f", "f", "m", "f", "f", "m", "f", "f", "m", "m", "f", …
## $ Age       <dbl> 58, 85, 62, 67, 75, 69, 58, 57, 72, 65, 61, 78, 81, 57, 74, …
## $ Stage     <chr> "II", "II", "III", "III", "III", "II", "II", "III", "II", "I…
## $ sidedness <chr> "left", "right", "rectum", "rectum", "left", "left", "rectum…
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##   class frequency percentage cumulative_perc
## 1    Pm        30        100             100
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    organ frequency percentage cumulative_perc
## 1  colon        25      83.33           83.33
## 2 rectum         5      16.67          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Sex frequency percentage cumulative_perc
## 1   f        17      56.67           56.67
## 2   m        13      43.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Stage frequency percentage cumulative_perc
## 1   III        20      66.67           66.67
## 2    II        10      33.33          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   sidedness frequency percentage cumulative_perc
## 1      left        13      43.33           43.33
## 2     right         9      30.00           73.33
## 3    rectum         5      16.67           90.00
## 4      <NA>         3      10.00          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## clinical_Pm 
## 
##  6  Variables      30  Observations
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct    value 
##       30        0        1       Pm 
##              
## Value      Pm
## Frequency  30
## Proportion  1
## --------------------------------------------------------------------------------
## organ 
##        n  missing distinct 
##       30        0        2 
##                         
## Value       colon rectum
## Frequency      25      5
## Proportion  0.833  0.167
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##       30        0        2 
##                       
## Value          f     m
## Frequency     17    13
## Proportion 0.567 0.433
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        4       18    0.997    68.77    13.91    53.75    56.50 
##      .25      .50      .75      .90      .95 
##    58.75    69.00    77.25    85.00    85.75 
## 
## lowest : 37 53 56 57 58, highest: 78 81 85 86 87
##                                                                             
## Value         37    53    56    57    58    61    62    65    67    69    72
## Frequency      1     1     1     2     2     1     1     1     1     3     2
## Proportion 0.038 0.038 0.038 0.077 0.077 0.038 0.038 0.038 0.038 0.115 0.077
##                                                     
## Value         74    75    78    81    85    86    87
## Frequency      2     1     1     2     2     1     1
## Proportion 0.077 0.038 0.038 0.077 0.077 0.038 0.038
## --------------------------------------------------------------------------------
## Stage 
##        n  missing distinct 
##       30        0        2 
##                       
## Value         II   III
## Frequency     10    20
## Proportion 0.333 0.667
## --------------------------------------------------------------------------------
## sidedness 
##        n  missing distinct 
##       27        3        3 
##                                
## Value        left rectum  right
## Frequency      13      5      9
## Proportion  0.481  0.185  0.333
## --------------------------------------------------------------------------------
# Mean age of the Pm subset, with missing ages dropped first.
a <- na.omit(clinical_Pm[["Age"]])
mean(a)
## [1] 68.76923
  • Statistical differences between the patient groups for several clinical variables
# Balanced clinical table with every character column converted to a factor.
clinical_factor <- clinic1_bal %>%
  mutate_if(is.character, as.factor)
# clinical_factor <- na.omit(clinical_factor)

# Proportion of each organ within each class.
ggplot(clinical_factor, aes(x = class, fill = organ)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Proportion of each sex within each class.
ggplot(clinical_factor, aes(x = class, fill = Sex)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Sidedness plot excludes "rectum"; filter() also drops rows whose sidedness
# is NA because the condition evaluates to NA for them.
clinical_factor1 <- filter(clinical_factor, !str_detect(sidedness, "rectum"))

ggplot(clinical_factor1, aes(x = class, fill = sidedness)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4"))

# Proportion of each stage within each class.
ggplot(clinical_factor, aes(x = class, fill = Stage)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#ADD8E6", "#4682B4", "#000080"))

# Overlaid age histograms, one colour per class.
# The chain previously ended after scale_fill_manual() with no trailing `+`,
# so labs(fill = "") was evaluated as a separate statement (printing a bare
# "labels" object) and the legend title was never cleared.
clinical_factor %>%
  ggplot(aes(x = Age, fill = class)) +
  geom_histogram(color = "#e9ecef", alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("#69b3a2", "#404080")) +
  labs(fill = "")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing non-finite values (stat_bin).
# Class-by-organ contingency table and Fisher's exact test on it.
stat_data_organ <- table(clinical_factor[["class"]], clinical_factor[["organ"]])
# TODO: plot(...)
fisher.test(stat_data_organ)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_organ
## p-value = 0.7153
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.2495643 10.4647177
## sample estimates:
## odds ratio 
##   1.456644
# Class-by-sex contingency table and Fisher's exact test on it.
stat_data_sex <- table(clinical_factor[["class"]], clinical_factor[["Sex"]])
fisher.test(stat_data_sex)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_sex
## p-value = 0.7899
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2504095 2.7417911
## sample estimates:
## odds ratio 
##  0.8312827
# Class-by-stage contingency table and Fisher's exact test on it.
stat_data_stage <- table(clinical_factor[["class"]], clinical_factor[["Stage"]])
fisher.test(stat_data_stage)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_stage
## p-value = 0.109
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.7476062 8.7792316
## sample estimates:
## odds ratio 
##   2.500865
# Class-by-sidedness contingency table (rows with missing sidedness are not
# counted by table()) and Fisher's exact test on it.
stat_data_side <- table(clinical_factor[["class"]], clinical_factor[["sidedness"]])
fisher.test(stat_data_side)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  stat_data_side
## p-value = 0.04474
## alternative hypothesis: two.sided
# Age distribution within each class, followed by per-class summary statistics.
hist(clinical_factor$Age[clinical_factor$class == "P"])

hist(clinical_factor$Age[clinical_factor$class == "Pm"])

tapply(X = clinical_factor$Age, INDEX = clinical_factor$class, FUN = summary)
## $P
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   39.00   62.50   73.50   70.36   77.75   85.00       3 
## 
## $Pm
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   37.00   58.75   69.00   68.77   77.25   87.00       4
# Two-sample t-test of Age between the two classes (Welch by default).
t.test(Age ~ class, data = clinical_factor)
## 
##  Welch Two Sample t-test
## 
## data:  Age by class
## t = 0.46709, df = 45.47, p-value = 0.6427
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.278678  8.467489
## sample estimates:
##  mean in group P mean in group Pm 
##         70.36364         68.76923

3.3.3 Survival analysis

3.3.3.1 Stage

# Join survival data and clinical data on their row names (by = 0 is merge()'s
# numeric spelling of "row.names"). Note: `data` shadows utils::data().
data <- merge(datasurv1, clinical, by = 0)

# Kaplan-Meier estimate stratified by Stage.
fit <- survfit(Surv(time, Status) ~ Stage, data = data)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ Stage, data = data)
## 
##            n events median 0.95LCL 0.95UCL
## Stage=II  24      9   3728    1816      NA
## Stage=III 31     16   1682     807      NA
# Per-event-time survival table for each Stage group (numbers at risk,
# events, survival estimate, standard error and 95% confidence limits).
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ Stage, data = data)
## 
##                 Stage=II 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     24       1    0.958  0.0408        0.882        1.000
##   400     22       1    0.915  0.0577        0.808        1.000
##   467     21       1    0.871  0.0695        0.745        1.000
##  1357     17       1    0.820  0.0821        0.674        0.998
##  1428     16       1    0.769  0.0916        0.609        0.971
##  1635     15       1    0.717  0.0988        0.548        0.940
##  1816     14       1    0.666  0.1042        0.490        0.905
##  2011     11       1    0.606  0.1109        0.423        0.867
##  3728      5       1    0.485  0.1400        0.275        0.854
## 
##                 Stage=III 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   216     30       1    0.967  0.0328        0.905        1.000
##   386     29       1    0.933  0.0455        0.848        1.000
##   420     28       1    0.900  0.0548        0.799        1.000
##   500     27       1    0.867  0.0621        0.753        0.997
##   518     26       1    0.833  0.0680        0.710        0.978
##   520     25       1    0.800  0.0730        0.669        0.957
##   573     24       1    0.767  0.0772        0.629        0.934
##   632     23       1    0.733  0.0807        0.591        0.910
##   682     22       1    0.700  0.0837        0.554        0.885
##   797     21       1    0.667  0.0861        0.518        0.859
##   807     20       1    0.633  0.0880        0.482        0.832
##   857     19       1    0.600  0.0894        0.448        0.804
##  1107     15       1    0.560  0.0920        0.406        0.773
##  1157     12       1    0.513  0.0954        0.357        0.739
##  1682     10       1    0.462  0.0987        0.304        0.702
##  1835      9       1    0.411  0.1002        0.255        0.663
# Access the short summary table: one row per Stage group with record and
# event counts, restricted mean and median survival time.
summary(fit)$table
##           records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## Stage=II       24    24      24      9 3135.733   361.0049   3728    1816
## Stage=III      31    31      31     16 2371.863   361.9438   1682     807
##           0.95UCL
## Stage=II       NA
## Stage=III      NA
# Kaplan-Meier curves by Stage with the p-value drawn on the plot.
ggsurvplot(
  fit,
  data = data,
  pval = TRUE
)

# Log-rank test comparing survival between Stage groups.
surv.stage <- survdiff(Surv(time, Status) ~ Stage, data = data)
print(surv.stage)
## Call:
## survdiff(formula = Surv(time, Status) ~ Stage, data = data)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## Stage=II  24        9     12.8      1.12      2.36
## Stage=III 31       16     12.2      1.17      2.36
## 
##  Chisq= 2.4  on 1 degrees of freedom, p= 0.1

3.3.3.2 Class - PM vs. P

# Kaplan-Meier estimate stratified by class (P vs. Pm); replaces the previous
# `fit` object.
fit <- survfit(Surv(time, Status) ~ class, data = data)
fit
## Call: survfit(formula = Surv(time, Status) ~ class, data = data)
## 
##           n events median 0.95LCL 0.95UCL
## class=P  25      3     NA    3728      NA
## class=Pm 30     22   1107     682    2011
# Per-event-time survival table for each class group (numbers at risk,
# events, survival estimate, standard error and 95% confidence limits).
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ class, data = data)
## 
##                 class=P 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   467     23       1    0.957  0.0425        0.877            1
##  1816     14       1    0.888  0.0768        0.750            1
##  3728      5       1    0.711  0.1703        0.444            1
## 
##                 class=Pm 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     30       1    0.967  0.0328        0.905        1.000
##   216     29       1    0.933  0.0455        0.848        1.000
##   386     28       1    0.900  0.0548        0.799        1.000
##   400     27       1    0.867  0.0621        0.753        0.997
##   420     26       1    0.833  0.0680        0.710        0.978
##   500     25       1    0.800  0.0730        0.669        0.957
##   518     24       1    0.767  0.0772        0.629        0.934
##   520     23       1    0.733  0.0807        0.591        0.910
##   573     22       1    0.700  0.0837        0.554        0.885
##   632     21       1    0.667  0.0861        0.518        0.859
##   682     20       1    0.633  0.0880        0.482        0.832
##   797     19       1    0.600  0.0894        0.448        0.804
##   807     18       1    0.567  0.0905        0.414        0.775
##   857     17       1    0.533  0.0911        0.382        0.745
##  1107     15       1    0.498  0.0917        0.347        0.714
##  1157     14       1    0.462  0.0918        0.313        0.682
##  1357     13       1    0.427  0.0913        0.280        0.649
##  1428     12       1    0.391  0.0904        0.249        0.615
##  1635     11       1    0.356  0.0889        0.218        0.580
##  1682     10       1    0.320  0.0868        0.188        0.545
##  1835      9       1    0.284  0.0841        0.159        0.508
##  2011      7       1    0.244  0.0813        0.127        0.469
# Short per-group summary table: records, events, restricted mean survival
# time and median survival with its 95% confidence limits.
summary(fit)$table
##          records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## class=P       25    25      25      3 4029.716   264.4579     NA    3728
## class=Pm      30    30      30     22 1799.706   302.9623   1107     682
##          0.95UCL
## class=P       NA
## class=Pm    2011
# Kaplan-Meier curves per class, with the test p-value drawn on the plot.
ggsurvplot(
  fit,
  data = data,
  pval = TRUE
)

# Log-rank test: do the survival curves differ between the class groups?
# (The result is stored in `surv.stage`, the same name used for the other
# survival comparisons in this section.)
surv.stage <- survdiff(
  Surv(time, Status) ~ class,
  data = data
)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ class, data = data)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## class=P  25        3     13.1      7.82      16.7
## class=Pm 30       22     11.9      8.66      16.7
## 
##  Chisq= 16.7  on 1 degrees of freedom, p= 4e-05

3.3.3.3 Sidedness

# Exclude rectal tumours so that only left- vs. right-sided cases are compared.
# A logical mask is used instead of `-which(...)`: when no row matches,
# `-which()` evaluates to `integer(0)` and would silently drop EVERY row.
# Rows with a missing sidedness are kept, exactly as `-which()` kept them;
# they are reported as "deleted due to missingness" by the model below.
newdata <- data[is.na(data$sidedness) | data$sidedness != "rectum", ]

# Kaplan-Meier estimate of survival, stratified by tumour sidedness.
fit <- survfit(Surv(time, Status) ~ sidedness, data = newdata)
print(fit)
## Call: survfit(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
##    4 observations deleted due to missingness 
##                  n events median 0.95LCL 0.95UCL
## sidedness=left  17      9   1428     682      NA
## sidedness=right 24      9   2011    1635      NA
# Full life table per sidedness group: one row per event time with the number
# at risk, the number of events, the survival estimate and its 95% CI.
summary(fit)
## Call: survfit(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
## 4 observations deleted due to missingness 
##                 sidedness=left 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   400     16       1    0.938  0.0605        0.826        1.000
##   467     15       1    0.875  0.0827        0.727        1.000
##   520     14       1    0.812  0.0976        0.642        1.000
##   632     13       1    0.750  0.1083        0.565        0.995
##   682     12       1    0.688  0.1159        0.494        0.957
##   807     11       1    0.625  0.1210        0.428        0.914
##  1107      9       1    0.556  0.1259        0.356        0.866
##  1428      8       1    0.486  0.1279        0.290        0.814
##  1835      7       1    0.417  0.1271        0.229        0.758
## 
##                 sidedness=right 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   173     24       1    0.958  0.0408        0.882        1.000
##   420     22       1    0.915  0.0577        0.808        1.000
##   500     21       1    0.871  0.0695        0.745        1.000
##   797     19       1    0.825  0.0795        0.683        0.997
##   857     18       1    0.780  0.0873        0.626        0.971
##  1157     11       1    0.709  0.1042        0.531        0.945
##  1635      9       1    0.630  0.1187        0.435        0.911
##  1816      8       1    0.551  0.1273        0.350        0.867
##  2011      5       1    0.441  0.1418        0.235        0.828
# Short per-group summary table: records, events, restricted mean survival
# time and median survival with its 95% confidence limits.
summary(fit)$table
##                 records n.max n.start events   *rmean *se(rmean) median 0.95LCL
## sidedness=left       17    17      17      9 2412.722   467.4406   1428     682
## sidedness=right      24    24      24      9 2698.588   443.2539   2011    1635
##                 0.95UCL
## sidedness=left       NA
## sidedness=right      NA
# Kaplan-Meier curves per sidedness group (rectum excluded), with the test
# p-value drawn on the plot.
ggsurvplot(
  fit,
  data = newdata,
  pval = TRUE
)

# Log-rank test: do the survival curves differ between left- and right-sided
# tumours? (The result is stored in `surv.stage`, the same name used for the
# other survival comparisons in this section.)
surv.stage <- survdiff(
  Surv(time, Status) ~ sidedness,
  data = newdata
)
surv.stage
## Call:
## survdiff(formula = Surv(time, Status) ~ sidedness, data = newdata)
## 
## n=41, 4 observations deleted due to missingness.
## 
##                  N Observed Expected (O-E)^2/E (O-E)^2/V
## sidedness=left  17        9     7.53     0.289       0.5
## sidedness=right 24        9    10.47     0.207       0.5
## 
##  Chisq= 0.5  on 1 degrees of freedom, p= 0.5

3.3.4 DEGs

# Differential expression between the `class` groups with edgeR.
# Inputs: `rnaseq1` (samples in rows, genes in columns) and the clinical table
# `DATASET1_bal` (one row per sample, identified by `ID`).

# Sort both tables by sample ID.
xdata <- rnaseq1[order(row.names(rnaseq1)), ]
rownames(DATASET1_bal) <- DATASET1_bal$ID
ydata <- as.data.frame(DATASET1_bal[order(row.names(DATASET1_bal)), ])
rownames(ydata) <- ydata$ID

# Keep only the samples that are present in both tables.
xdata <- xdata[rownames(xdata) %in% rownames(ydata), ]
ydata <- as.data.frame(ydata[rownames(ydata) %in% rownames(xdata), ])

# The group labels are attached to the count columns by position only, so
# stop here if the two tables do not list the same samples in the same order
# (otherwise the test below would silently compare mislabelled samples).
stopifnot(identical(rownames(xdata), rownames(ydata)))

# keep features with standard deviation > 0
# seq_len()/vapply() instead of seq()/sapply(): seq(0) would iterate over
# c(1, 0), and vapply() guarantees a numeric vector for the comparison.
gene_sd <- vapply(
  seq_len(ncol(xdata)),
  function(ix) sd(xdata[, ix]),
  numeric(1)
)
xdata <- xdata[, gene_sd != 0]

# edgeR expects genes in rows and samples in columns.
xdata <- t(xdata)

group <- as.factor(ydata$class)
class <- as.data.frame(ydata$class)
edgeR.DGElist <- DGEList(counts = xdata, group = group)

# remove genes that do not have one count per million in at least 5 samples
keep <- rowSums(cpm(edgeR.DGElist) >= 1) >= 5
edgeR.DGElist <- edgeR.DGElist[keep, ]

# NOTE(review): no calcNormFactors() call is made, so only library-size
# normalisation is applied - confirm this is intended.

# specify the design setup: intercept + group effect
design <- model.matrix(~group)

# estimate the dispersion for all read counts across all samples
edgeR.DGElist <- estimateDisp(edgeR.DGElist, design)

# fit the negative binomial model
edger_fit <- glmFit(edgeR.DGElist, design)

# perform the testing for every gene using the neg. binomial model
# (no `coef` is given, so the last design column - the group effect - is tested)
edger_lrt <- glmLRT(edger_fit)
summary(decideTests(edger_lrt))
##        groupPm
## Down        99
## NotSig   20096
## Up          39
# Extract the per-gene results from `edger_lrt` for all genes (n = Inf),
# sorted by p-value, with Benjamini-Hochberg adjusted p-values (FDR column).
DGE.results_edgeR <- topTags(
  edger_lrt,
  n = Inf,
  sort.by = "PValue",
  adjust.method = "BH"
)

# Table with the top 10 DEGs.
topTags(DGE.results_edgeR)
## Coefficient:  groupPm 
##               logFC      logCPM       LR       PValue          FDR
## SPOCK2    -3.402797  3.69918001 38.67871 4.996291e-10 1.010949e-05
## LGR6      -2.025917  2.03265558 32.84611 9.975057e-09 8.069582e-05
## ORM1       5.389676  1.39204222 32.23599 1.365384e-08 8.069582e-05
## H2BS1      6.950155 -2.01960359 31.93370 1.595252e-08 8.069582e-05
## MTRNR2L12 -3.810406  4.36430059 30.37005 3.569974e-08 1.444697e-04
## NELL2     -3.087534  1.88688828 29.37419 5.966713e-08 1.780755e-04
## CXCL11    -3.012100  2.06930721 29.31223 6.160565e-08 1.780755e-04
## IDO1      -2.451621  2.14732736 28.91556 7.560303e-08 1.912190e-04
## SLCO1B1   -4.505595  0.04310525 27.70892 1.410109e-07 3.170239e-04
## SLCO1B7   -3.255436  1.08014684 27.13707 1.895286e-07 3.834922e-04
# Keep the genes called differentially expressed at FDR < 0.05
# (rows with a missing FDR are dropped).
genes_deg <- DGE.results_edgeR$table
genes_deg <- genes_deg[!is.na(genes_deg$FDR) & genes_deg$FDR < 0.05, ]

# Number of genes found to be differentially expressed (rows) x columns.
dim(genes_deg)
## [1] 138   5
# Copy the gene symbols into a column so they survive the export
# (write_xlsx does not write row names), then save the DEG table.
genes_deg$row <- row.names(genes_deg)
write_xlsx(genes_deg, "genes_deg_d3.xlsx")

# Up to 100 DEGs with the lowest p-value (the table is sorted by p-value).
# head() is used instead of `[1:100, ]`, which pads the result with NA rows
# (row names "NA", "NA.1", ...) whenever fewer than 100 genes are available.
top100_deg <- head(rownames(genes_deg), 100)

# Up-regulated genes: positive log fold change for the tested group effect.
high <- genes_deg[genes_deg$logFC > 0, ]
dim(high)
## [1] 39  6
# Up to 100 up-regulated DEGs. `high[1:100, ]` would pad the table with NA
# rows when it has fewer than 100 rows, producing bogus gene names such as
# "NA", "NA.1", ...; head() returns only the genes that exist.
top100_high <- head(rownames(high), 100)

# Down-regulated genes: negative log fold change for the tested group effect.
low <- genes_deg[genes_deg$logFC < 0, ]
dim(low)
## [1] 99  6
# Up to 100 down-regulated DEGs; head() avoids the NA padding (bogus "NA"
# gene names) that `low[1:100, ]` produces when `low` has fewer than 100 rows.
top100_low <- head(rownames(low), 100)

3.3.5 Classification

  • In this work we tested 3 different types of classification: 1) Classification without regularization based on DEGs; 2) Classification with regularization (EN and iTwiner); 3) Classification without regularization based on genes selected by regularized logistic regression;
# Build the predictor table `xdata` and the response table `ydata`
# (class coded 0 = "P", 1 = "Pm") with matching, ID-sorted samples.
xdata.raw <- xdataT

# keep features with standard deviation > 0
# seq_len()/vapply() instead of seq()/sapply(): seq(0) would iterate over
# c(1, 0), and vapply() guarantees a numeric vector for the comparison.
feature_sd <- vapply(
  seq_len(ncol(xdata.raw)),
  function(ix) sd(xdata.raw[, ix]),
  numeric(1)
)
xdata <- xdata.raw[, feature_sd != 0]

# Response table: class label plus sample ID.
# NOTE(review): assumes `clinic1` and `DATASET1` list the samples in the same
# row order - confirm.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

# ydata.raw$class <- c(rep(0,28),rep(1,34))

# Recode the class labels: "P" -> 0, "Pm" -> 1 (converted to numeric below).
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# Keep only the samples present in both tables.
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ])

# Sort both tables by sample ID.
xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]

# Predictors and labels are paired by position from here on, so stop if the
# two tables do not list the same samples in the same order.
stopifnot(identical(rownames(xdata), rownames(ydata.raw)))

ydata <- as.data.frame(ydata.raw[, 1:2])
ydata$`clinic1$class` <- as.numeric(ydata$`clinic1$class`)

3.3.5.1 1) Classification without regularization based on DEGs

Five classifiers were used: Decision trees, linear and radial support vector machines, logistic regression and random forest

  • Let's pick the 50 differentially expressed genes with the lowest p-values found above
# Restrict the expression table to the 50 DEGs with the lowest p-value.
# Rows are selected by the sample IDs of `ydata` (its row names) instead of
# taking every row of `xdataT` as-is: the class labels appended below are
# attached by position, so the samples must be the same ones, in the same
# order, as in `ydata`.
xdata <- xdataT[rownames(ydata), top100_deg[1:50]]

# Remember the gene symbols, then switch to syntactic placeholder names
# (Var1, Var2, ...), one per selected gene.
nomesgenes <- colnames(xdata)
colnames(xdata) <- paste0("Var", seq_len(ncol(xdata)))
colnames(ydata) <- c("class", "row")

# Outcome column (factor of the 0/1 class codes).
xdata$type <- as.factor(ydata$class)

#xdata <- xdata[colMeans(xdata == 0) <= 0.6] #delete genes that have null values in at least 60% of the samples
  • Test the different classifiers 100 times to obtain median values for the measures of model performance such as accuracy, misclassification, sensitivity and specificity, among others
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.3.5.1.1 Measures of model performance train
  • Accuracy
# acc
acc_trees <- acc_train[1,]
hist(acc_trees)

mean(acc_trees)
## [1] 0.9915789
median(acc_trees)
## [1] 1
sd(acc_trees)
## [1] 0.01393505
acc_svm <- acc_train[2,]
hist(acc_svm)

mean(acc_svm)
## [1] 0.8239474
median(acc_svm)
## [1] 0.8157895
sd(acc_svm)
## [1] 0.03339099
acc_svmR <- acc_train[3,]
hist(acc_svmR)

mean(acc_svmR)
## [1] 0.8431579
median(acc_svmR)
## [1] 0.8947368
sd(acc_svmR)
## [1] 0.1519306
acc_logs <- acc_train[4,]
hist(acc_logs)

mean(acc_logs)
## [1] 1
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0
acc_rf <- acc_train[5,]
hist(acc_rf)

mean(acc_rf)
## [1] 1
median(acc_rf)
## [1] 1
sd(acc_rf)
## [1] 0
  • auc
# auc
auc_trees <- auc_train[1,]
mean(auc_trees)
## [1] 0.9918768
median(auc_trees)
## [1] 1
sd(auc_trees)
## [1] 0.01364299
auc_svm <- auc_train[2,]
mean(auc_svm)
## [1] 0.8077171
median(auc_svm)
## [1] 0.7941176
sd(auc_svm)
## [1] 0.04325313
auc_svmR <- auc_train[3,]
mean(auc_svmR)
## [1] 0.8316527
median(auc_svmR)
## [1] 0.8991597
sd(auc_svmR)
## [1] 0.1722223
auc_logs <- auc_train[4,]
mean(auc_logs)
## [1] 1
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0
auc_rf <- auc_train[5,]
mean(auc_rf)
## [1] 1
median(auc_rf)
## [1] 1
sd(auc_rf)
## [1] 0
  • Misclassification
# miscl
miscl_trees <- miscl_train[1,]
mean(miscl_trees)
## [1] 0.32
median(miscl_trees)
## [1] 0
sd(miscl_trees)
## [1] 0.5295319
miscl_svm <- miscl_train[2,]
mean(miscl_svm)
## [1] 6.69
median(miscl_svm)
## [1] 7
sd(miscl_svm)
## [1] 1.268858
miscl_svmR <- miscl_train[3,]
mean(miscl_svmR)
## [1] 5.96
median(miscl_svmR)
## [1] 4
sd(miscl_svmR)
## [1] 5.773363
miscl_logs <- miscl_train[4,]
mean(miscl_logs)
## [1] 0
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0
miscl_rf <- miscl_train[5,]
mean(miscl_rf)
## [1] 0
median(miscl_rf)
## [1] 0
sd(miscl_rf)
## [1] 0
  • Sensitivity
# sensitivity
sensitivity_trees <- sensitivity_train[1,]
mean(sensitivity_trees)
## [1] 0.9947059
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.01691903
sensitivity_svm <- sensitivity_train[2,]
mean(sensitivity_svm)
## [1] 0.6535294
median(sensitivity_svm)
## [1] 0.5882353
sd(sensitivity_svm)
## [1] 0.1579497
sensitivity_svmR <- sensitivity_train[3,]
mean(sensitivity_svmR)
## [1] 0.7223529
median(sensitivity_svmR)
## [1] 0.9411765
sd(sensitivity_svmR)
## [1] 0.3760582
sensitivity_logs <- sensitivity_train[4,]
mean(sensitivity_logs)
## [1] 1
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0
sensitivity_rf <- sensitivity_train[5,]
mean(sensitivity_rf)
## [1] 1
median(sensitivity_rf)
## [1] 1
sd(sensitivity_rf)
## [1] 0
  • Specificity
# specificity
specificity_trees <- specificity_train[1,]
mean(specificity_trees)
## [1] 0.9890476
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.0201406
specificity_svm <- specificity_train[2,]
mean(specificity_svm)
## [1] 0.9619048
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.08694009
specificity_svmR <- specificity_train[3,]
mean(specificity_svmR)
## [1] 0.9409524
median(specificity_svmR)
## [1] 1
sd(specificity_svmR)
## [1] 0.08402482
specificity_logs <- specificity_train[4,]
mean(specificity_logs)
## [1] 1
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0
specificity_rf <- specificity_train[5,]
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# fneg
fneg_trees <- fneg_train[1,]
mean(fneg_trees)
## [1] 0.09
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.2876235
fneg_svm <- fneg_train[2,]
mean(fneg_svm)
## [1] 5.89
median(fneg_svm)
## [1] 7
sd(fneg_svm)
## [1] 2.685144
fneg_svmR <- fneg_train[3,]
mean(fneg_svmR)
## [1] 4.72
median(fneg_svmR)
## [1] 1
sd(fneg_svmR)
## [1] 6.392989
fneg_logs <- fneg_train[4,]
mean(fneg_logs)
## [1] 0
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0
fneg_rf <- fneg_train[5,]
mean(fneg_rf)
## [1] 0
median(fneg_rf)
## [1] 0
sd(fneg_rf)
## [1] 0
3.3.5.1.2 Measures of model performance test
  • Accuracy
# acc
acc_trees <- acc[1,]
hist(acc_trees)

mean(acc_trees)
## [1] 0.6
median(acc_trees)
## [1] 0.5882353
sd(acc_trees)
## [1] 0.1010241
acc_svm <- acc[2,]
hist(acc_svm)

mean(acc_svm)
## [1] 0.6864706
median(acc_svm)
## [1] 0.7058824
sd(acc_svm)
## [1] 0.08321214
acc_svmR <- acc[3,]
hist(acc_svmR)

mean(acc_svmR)
## [1] 0.5411765
median(acc_svmR)
## [1] 0.5294118
sd(acc_svmR)
## [1] 0.09043601
acc_logs <- acc[4,]
hist(acc_logs)

mean(acc_logs)
## [1] 0.6652941
median(acc_logs)
## [1] 0.6470588
sd(acc_logs)
## [1] 0.1049923
acc_rf <- acc[5,]
hist(acc_rf)

mean(acc_rf)
## [1] 0.6805882
median(acc_rf)
## [1] 0.7058824
sd(acc_rf)
## [1] 0.1018013
  • auc
# auc
auc_trees <- auc[1,]
mean(auc_trees)
## [1] 0.6041667
median(auc_trees)
## [1] 0.5868056
sd(auc_trees)
## [1] 0.09547798
auc_svm <- auc[2,]
mean(auc_svm)
## [1] 0.6746528
median(auc_svm)
## [1] 0.6875
sd(auc_svm)
## [1] 0.08828012
auc_svmR <- auc[3,]
mean(auc_svmR)
## [1] 0.5403472
median(auc_svmR)
## [1] 0.5138889
sd(auc_svmR)
## [1] 0.084279
auc_logs <- auc[4,]
mean(auc_logs)
## [1] 0.6639583
median(auc_logs)
## [1] 0.6458333
sd(auc_logs)
## [1] 0.1014728
auc_rf <- auc[5,]
mean(auc_rf)
## [1] 0.6722222
median(auc_rf)
## [1] 0.6944444
sd(auc_rf)
## [1] 0.1026285
  • Misclassification
# miscl
miscl_trees <- miscl[1,]
mean(miscl_trees)
## [1] 6.8
median(miscl_trees)
## [1] 7
sd(miscl_trees)
## [1] 1.717409
miscl_svm <- miscl[2,]
mean(miscl_svm)
## [1] 5.33
median(miscl_svm)
## [1] 5
sd(miscl_svm)
## [1] 1.414606
miscl_svmR <- miscl[3,]
mean(miscl_svmR)
## [1] 7.8
median(miscl_svmR)
## [1] 8
sd(miscl_svmR)
## [1] 1.537412
miscl_logs <- miscl[4,]
mean(miscl_logs)
## [1] 5.69
median(miscl_logs)
## [1] 6
sd(miscl_logs)
## [1] 1.784869
miscl_rf <- miscl[5,]
mean(miscl_rf)
## [1] 5.43
median(miscl_rf)
## [1] 5
sd(miscl_rf)
## [1] 1.730621
  • Sensitivity
# sensitivity
sensitivity_trees <- sensitivity[1,]
mean(sensitivity_trees)
## [1] 0.59
median(sensitivity_trees)
## [1] 0.625
sd(sensitivity_trees)
## [1] 0.1714098
sensitivity_svm <- sensitivity[2,]
mean(sensitivity_svm)
## [1] 0.47375
median(sensitivity_svm)
## [1] 0.5
sd(sensitivity_svm)
## [1] 0.2270355
sensitivity_svmR <- sensitivity[3,]
mean(sensitivity_svmR)
## [1] 0.26125
median(sensitivity_svmR)
## [1] 0.25
sd(sensitivity_svmR)
## [1] 0.2248
sensitivity_logs <- sensitivity[4,]
mean(sensitivity_logs)
## [1] 0.61125
median(sensitivity_logs)
## [1] 0.625
sd(sensitivity_logs)
## [1] 0.1784606
sensitivity_rf <- sensitivity[5,]
mean(sensitivity_rf)
## [1] 0.53
median(sensitivity_rf)
## [1] 0.5
sd(sensitivity_rf)
## [1] 0.1812903
  • Specificity
# specificity
specificity_trees <- specificity[1,]
mean(specificity_trees)
## [1] 0.6088889
median(specificity_trees)
## [1] 0.6666667
sd(specificity_trees)
## [1] 0.1761995
specificity_svm <- specificity[2,]
mean(specificity_svm)
## [1] 0.8755556
median(specificity_svm)
## [1] 0.8888889
sd(specificity_svm)
## [1] 0.1370234
specificity_svmR <- specificity[3,]
mean(specificity_svmR)
## [1] 0.79
median(specificity_svmR)
## [1] 0.8888889
sd(specificity_svmR)
## [1] 0.2210378
specificity_logs <- specificity[4,]
mean(specificity_logs)
## [1] 0.7133333
median(specificity_logs)
## [1] 0.7222222
sd(specificity_logs)
## [1] 0.1876469
specificity_rf <- specificity[5,]
mean(specificity_rf)
## [1] 0.8144444
median(specificity_rf)
## [1] 0.8888889
sd(specificity_rf)
## [1] 0.1579699
  • False Neg
# fneg
fneg_trees <- fneg[1,]
mean(fneg_trees)
## [1] 3.28
median(fneg_trees)
## [1] 3
sd(fneg_trees)
## [1] 1.371278
fneg_svm <- fneg[2,]
mean(fneg_svm)
## [1] 4.21
median(fneg_svm)
## [1] 4
sd(fneg_svm)
## [1] 1.816284
fneg_svmR <- fneg[3,]
mean(fneg_svmR)
## [1] 5.91
median(fneg_svmR)
## [1] 6
sd(fneg_svmR)
## [1] 1.7984
fneg_logs <- fneg[4,]
mean(fneg_logs)
## [1] 3.11
median(fneg_logs)
## [1] 3
sd(fneg_logs)
## [1] 1.427684
fneg_rf <- fneg[5,]
mean(fneg_rf)
## [1] 3.76
median(fneg_rf)
## [1] 4
sd(fneg_rf)
## [1] 1.450322

3.3.5.2 2) Classification with regularization (EN and iTwiner)

# Rebuild the predictor table `xdata` (all genes of `xdataT`) and the response
# table `ydata` (class coded 0 = "P", 1 = "Pm") with matching, ID-sorted
# samples for the regularised models.
xdata <- xdataT

# Response table: class label plus sample ID, with the ID also as row name.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

# Recode the class labels: "P" -> 0, "Pm" -> 1 (converted to numeric below).
ydata.raw[["clinic1$class"]][ydata.raw[["clinic1$class"]] == "P"] <- 0
ydata.raw[["clinic1$class"]][ydata.raw[["clinic1$class"]] == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# Keep only the samples present in both tables.
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(
  ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ]
)

# Sort both tables by sample ID.
xdata <- xdata[order(rownames(xdata)), ]
ydata.raw <- ydata.raw[order(rownames(ydata.raw)), ]

ydata <- as.data.frame(ydata.raw)
ydata[["clinic1$class"]] <- as.numeric(ydata[["clinic1$class"]])
## [1] 0
## [1] 1
## [1] 1011
## [1] "data"
## [1] "en pred ups"
## [1] "EN"
## [1] 1
## [1] 2
## [1] 1022
## [1] "data"
## [1] "EN"
## [1] 2
## [1] 3
## [1] 1033
## [1] "data"
## [1] "EN"
## [1] 3
## [1] 4
## [1] 1044
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 3
## [1] 5
## [1] 1055
## [1] "data"
## [1] "EN"
## [1] 4
## [1] 6
## [1] 1066
## [1] "data"
## [1] "EN"
## [1] 5
## [1] 7
## [1] 1077
## [1] "data"
## [1] "EN"
## [1] 6
## [1] 8
## [1] 1088
## [1] "data"
## [1] "EN"
## [1] 7
## [1] 9
## [1] 1099
## [1] "data"
## [1] "EN"
## [1] 8
## [1] 10
## [1] 1110
## [1] "data"
## [1] "EN"
## [1] 9
## [1] 11
## [1] 1121
## [1] "data"
## [1] "EN"
## [1] 10
## [1] 12
## [1] 1132
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 10
## [1] 13
## [1] 1143
## [1] "data"
## [1] "EN"
## [1] 11
## [1] 14
## [1] 1154
## [1] "data"
## [1] "EN"
## [1] 12
## [1] 15
## [1] 1165
## [1] "data"
## [1] "EN"
## [1] 13
## [1] 16
## [1] 1176
## [1] "data"
## [1] "en pred ups"
## [1] "EN"
## [1] 14
## [1] 17
## [1] 1187
## [1] "data"
## [1] "EN"
## [1] 15
## [1] 18
## [1] 1198
## [1] "data"
## [1] "EN"
## [1] 16
## [1] 19
## [1] 1209
## [1] "data"
## [1] "EN"
## [1] 17
## [1] 20
## [1] 1220
## [1] "data"
## [1] "EN"
## [1] 18
## [1] 21
## [1] 1231
## [1] "data"
## [1] "en train ups"
## [1] "en pred ups"
## [1] "EN"
## [1] 18
## [1] 22
## [1] 1242
## [1] "data"
## [1] "EN"
## [1] 19
## [1] 23
## [1] 1253
## [1] "data"
## [1] "EN"
## [1] 20
## [1] 24
## [1] 1264
## [1] "data"
## [1] "EN"
## [1] 21
## [1] 25
## [1] 1275
## [1] "data"
## [1] "EN"
## [1] 22
## [1] 26
## [1] 1286
## [1] "data"
## [1] "EN"
## [1] 23
## [1] 27
## [1] 1297
## [1] "data"
## [1] "EN"
## [1] 24
## [1] 28
## [1] 1308
## [1] "data"
## [1] "EN"
## [1] 25
## [1] 29
## [1] 1319
## [1] "data"
## [1] "EN"
## [1] 26
## [1] 30
## [1] 1330
## [1] "data"
## [1] "EN"
## [1] 27
## [1] 31
## [1] 1341
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 27
## [1] 32
## [1] 1352
## [1] "data"
## [1] "EN"
## [1] 28
## [1] 33
## [1] 1363
## [1] "data"
## [1] "EN"
## [1] 29
## [1] 34
## [1] 1374
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 29
## [1] 35
## [1] 1385
## [1] "data"
## [1] "EN"
## [1] 30
## [1] 36
## [1] 1396
## [1] "data"
## [1] "EN"
## [1] 31
## [1] 37
## [1] 1407
## [1] "data"
## [1] "EN"
## [1] 32
## [1] 38
## [1] 1418
## [1] "data"
## [1] "EN"
## [1] 33
## [1] 39
## [1] 1429
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 33
## [1] 40
## [1] 1440
## [1] "data"
## [1] "en train ups"
## [1] "en pred ups"
## [1] "EN"
## [1] 33
## [1] 41
## [1] 1451
## [1] "data"
## [1] "EN"
## [1] 34
## [1] 42
## [1] 1462
## [1] "data"
## [1] "en pred ups"
## [1] "EN"
## [1] 35
## [1] 43
## [1] 1473
## [1] "data"
## [1] "EN"
## [1] 36
## [1] 44
## [1] 1484
## [1] "data"
## [1] "EN"
## [1] 37
## [1] 45
## [1] 1495
## [1] "data"
## [1] "EN"
## [1] 38
## [1] 46
## [1] 1506
## [1] "data"
## [1] "EN"
## [1] 39
## [1] 47
## [1] 1517
## [1] "data"
## [1] "en pred ups"
## [1] "EN"
## [1] 40
## [1] 48
## [1] 1528
## [1] "data"
## [1] "EN"
## [1] 41
## [1] 49
## [1] 1539
## [1] "data"
## [1] "EN"
## [1] 42
## [1] 50
## [1] 1550
## [1] "data"
## [1] "EN"
## [1] 43
## [1] 51
## [1] 1561
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 43
## [1] 52
## [1] 1572
## [1] "data"
## [1] "EN"
## [1] 44
## [1] 53
## [1] 1583
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 44
## [1] 54
## [1] 1594
## [1] "data"
## [1] "en train ups"
## [1] "en pred ups"
## [1] "EN"
## [1] 44
## [1] 55
## [1] 1605
## [1] "data"
## [1] "EN"
## [1] 45
## [1] 56
## [1] 1616
## [1] "data"
## [1] "EN"
## [1] 46
## [1] 57
## [1] 1627
## [1] "data"
## [1] "EN"
## [1] 47
## [1] 58
## [1] 1638
## [1] "data"
## [1] "en pred ups"
## [1] "EN"
## [1] 48
## [1] 59
## [1] 1649
## [1] "data"
## [1] "EN"
## [1] 49
## [1] 60
## [1] 1660
## [1] "data"
## [1] "EN"
## [1] 50
## [1] 61
## [1] 1671
## [1] "data"
## [1] "EN"
## [1] 51
## [1] 62
## [1] 1682
## [1] "data"
## [1] "EN"
## [1] 52
## [1] 63
## [1] 1693
## [1] "data"
## [1] "EN"
## [1] 53
## [1] 64
## [1] 1704
## [1] "data"
## [1] "EN"
## [1] 54
## [1] 65
## [1] 1715
## [1] "data"
## [1] "EN"
## [1] 55
## [1] 66
## [1] 1726
## [1] "data"
## [1] "EN"
## [1] 56
## [1] 67
## [1] 1737
## [1] "data"
## [1] "EN"
## [1] 57
## [1] 68
## [1] 1748
## [1] "data"
## [1] "EN"
## [1] 58
## [1] 69
## [1] 1759
## [1] "data"
## [1] "EN"
## [1] 59
## [1] 70
## [1] 1770
## [1] "data"
## [1] "EN"
## [1] 60
## [1] 71
## [1] 1781
## [1] "data"
## [1] "EN"
## [1] 61
## [1] 72
## [1] 1792
## [1] "data"
## [1] "EN"
## [1] 62
## [1] 73
## [1] 1803
## [1] "data"
## [1] "EN"
## [1] 63
## [1] 74
## [1] 1814
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 63
## [1] 75
## [1] 1825
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 63
## [1] 76
## [1] 1836
## [1] "data"
## [1] "EN"
## [1] 64
## [1] 77
## [1] 1847
## [1] "data"
## [1] "EN"
## [1] 65
## [1] 78
## [1] 1858
## [1] "data"
## [1] "EN"
## [1] 66
## [1] 79
## [1] 1869
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 66
## [1] 80
## [1] 1880
## [1] "data"
## [1] "EN"
## [1] 67
## [1] 81
## [1] 1891
## [1] "data"
## [1] "EN"
## [1] 68
## [1] 82
## [1] 1902
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 68
## [1] 83
## [1] 1913
## [1] "data"
## [1] "EN"
## [1] 69
## [1] 84
## [1] 1924
## [1] "data"
## [1] "EN"
## [1] 70
## [1] 85
## [1] 1935
## [1] "data"
## [1] "EN"
## [1] 71
## [1] 86
## [1] 1946
## [1] "data"
## [1] "EN"
## [1] 72
## [1] 87
## [1] 1957
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 72
## [1] 88
## [1] 1968
## [1] "data"
## [1] "EN"
## [1] 73
## [1] 89
## [1] 1979
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 73
## [1] 90
## [1] 1990
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 73
## [1] 91
## [1] 2001
## [1] "data"
## [1] "EN"
## [1] 74
## [1] 92
## [1] 2012
## [1] "data"
## [1] "EN"
## [1] 75
## [1] 93
## [1] 2023
## [1] "data"
## [1] "EN"
## [1] 76
## [1] 94
## [1] 2034
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 76
## [1] 95
## [1] 2045
## [1] "data"
## [1] "EN"
## [1] 77
## [1] 96
## [1] 2056
## [1] "data"
## [1] "EN"
## [1] 78
## [1] 97
## [1] 2067
## [1] "data"
## [1] "EN"
## [1] 79
## [1] 98
## [1] 2078
## [1] "data"
## [1] "EN"
## [1] 80
## [1] 99
## [1] 2089
## [1] "data"
## [1] "EN"
## [1] 81
## [1] 100
## [1] 2100
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 81
## [1] 101
## [1] 2111
## [1] "data"
## [1] "EN"
## [1] 82
## [1] 102
## [1] 2122
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 82
## [1] 103
## [1] 2133
## [1] "data"
## [1] "EN"
## [1] 83
## [1] 104
## [1] 2144
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 83
## [1] 105
## [1] 2155
## [1] "data"
## [1] "EN"
## [1] 84
## [1] 106
## [1] 2166
## [1] "data"
## [1] "EN"
## [1] 85
## [1] 107
## [1] 2177
## [1] "data"
## [1] "EN"
## [1] 86
## [1] 108
## [1] 2188
## [1] "data"
## [1] "EN"
## [1] 87
## [1] 109
## [1] 2199
## [1] "data"
## [1] "EN"
## [1] 88
## [1] 110
## [1] 2210
## [1] "data"
## [1] "EN"
## [1] 89
## [1] 111
## [1] 2221
## [1] "data"
## [1] "EN"
## [1] 90
## [1] 112
## [1] 2232
## [1] "data"
## [1] "EN"
## [1] 91
## [1] 113
## [1] 2243
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 91
## [1] 114
## [1] 2254
## [1] "data"
## [1] "EN"
## [1] 92
## [1] 115
## [1] 2265
## [1] "data"
## [1] "EN não selecionou variaveis"
## [1] 92
## [1] 116
## [1] 2276
## [1] "data"
## [1] "EN"
## [1] 93
## [1] 117
## [1] 2287
## [1] "data"
## [1] "en pred ups"
## [1] "EN"
## [1] 94
## [1] 118
## [1] 2298
## [1] "data"
## [1] "EN"
## [1] 95
## [1] 119
## [1] 2309
## [1] "data"
## [1] "EN"
## [1] 96
## [1] 120
## [1] 2320
## [1] "data"
## [1] "EN"
## [1] 97
## [1] 121
## [1] 2331
## [1] "data"
## [1] "EN"
## [1] 98
## [1] 122
## [1] 2342
## [1] "data"
## [1] "EN"
## [1] 99
## [1] 123
## [1] 2353
## [1] "data"
## [1] "EN"
## [1] 1
## [1] 0
## [1] 2364
## [1] "data"
## [1] "iTwiner"
## [1] 2
## [1] 1
## [1] 2375
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 3
## [1] 1
## [1] 2386
## [1] "data"
## [1] "iTwiner"
## [1] 4
## [1] 2
## [1] 2397
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 5
## [1] 3
## [1] 2408
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 6
## [1] 4
## [1] 2419
## [1] "data"
## [1] "iTwiner"
## [1] 7
## [1] 5
## [1] 2430
## [1] "data"
## [1] "iTwiner"
## [1] 8
## [1] 6
## [1] 2441
## [1] "data"
## [1] "iTwiner"
## [1] 9
## [1] 7
## [1] 2452
## [1] "data"
## [1] "iTwiner"
## [1] 10
## [1] 8
## [1] 2463
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 11
## [1] 9
## [1] 2474
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 12
## [1] 9
## [1] 2485
## [1] "data"
## [1] "iTwiner"
## [1] 13
## [1] 10
## [1] 2496
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 14
## [1] 10
## [1] 2507
## [1] "data"
## [1] "iTwiner"
## [1] 15
## [1] 11
## [1] 2518
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 16
## [1] 11
## [1] 2529
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 17
## [1] 11
## [1] 2540
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 18
## [1] 12
## [1] 2551
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 19
## [1] 12
## [1] 2562
## [1] "data"
## [1] "iTwiner"
## [1] 20
## [1] 13
## [1] 2573
## [1] "data"
## [1] "iTwiner"
## [1] 21
## [1] 14
## [1] 2584
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 22
## [1] 15
## [1] 2595
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 23
## [1] 15
## [1] 2606
## [1] "data"
## [1] "iTwiner"
## [1] 24
## [1] 16
## [1] 2617
## [1] "data"
## [1] "iTwiner"
## [1] 25
## [1] 17
## [1] 2628
## [1] "data"
## [1] "iTwiner"
## [1] 26
## [1] 18
## [1] 2639
## [1] "data"
## [1] "iTwiner"
## [1] 27
## [1] 19
## [1] 2650
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 28
## [1] 19
## [1] 2661
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 29
## [1] 20
## [1] 2672
## [1] "data"
## [1] "iTwiner"
## [1] 30
## [1] 21
## [1] 2683
## [1] "data"
## [1] "iTwiner"
## [1] 31
## [1] 22
## [1] 2694
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 32
## [1] 22
## [1] 2705
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 33
## [1] 22
## [1] 2716
## [1] "data"
## [1] "iTwiner"
## [1] 34
## [1] 23
## [1] 2727
## [1] "data"
## [1] "iTwiner"
## [1] 35
## [1] 24
## [1] 2738
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 36
## [1] 24
## [1] 2749
## [1] "data"
## [1] "iTwiner"
## [1] 37
## [1] 25
## [1] 2760
## [1] "data"
## [1] "iTwiner"
## [1] 38
## [1] 26
## [1] 2771
## [1] "data"
## [1] "iTwiner"
## [1] 39
## [1] 27
## [1] 2782
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 40
## [1] 27
## [1] 2793
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 41
## [1] 27
## [1] 2804
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 42
## [1] 28
## [1] 2815
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 43
## [1] 28
## [1] 2826
## [1] "data"
## [1] "iTwiner"
## [1] 44
## [1] 29
## [1] 2837
## [1] "data"
## [1] "iTwiner"
## [1] 45
## [1] 30
## [1] 2848
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 46
## [1] 30
## [1] 2859
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 47
## [1] 30
## [1] 2870
## [1] "data"
## [1] "iTwiner"
## [1] 48
## [1] 31
## [1] 2881
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 49
## [1] 31
## [1] 2892
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 50
## [1] 31
## [1] 2903
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 51
## [1] 32
## [1] 2914
## [1] "data"
## [1] "iTwiner"
## [1] 52
## [1] 33
## [1] 2925
## [1] "data"
## [1] "iTwiner"
## [1] 53
## [1] 34
## [1] 2936
## [1] "data"
## [1] "iTwiner"
## [1] 54
## [1] 35
## [1] 2947
## [1] "data"
## [1] "iTwiner"
## [1] 55
## [1] 36
## [1] 2958
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 56
## [1] 36
## [1] 2969
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 57
## [1] 37
## [1] 2980
## [1] "data"
## [1] "iTwiner"
## [1] 58
## [1] 38
## [1] 2991
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner"
## [1] 59
## [1] 38
## [1] 3002
## [1] "data"
## [1] "iTwiner"
## [1] 60
## [1] 39
## [1] 3013
## [1] "data"
## [1] "iTwiner"
## [1] 61
## [1] 40
## [1] 3024
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 62
## [1] 41
## [1] 3035
## [1] "data"
## [1] "iTwiner"
## [1] 63
## [1] 42
## [1] 3046
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 64
## [1] 42
## [1] 3057
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 65
## [1] 43
## [1] 3068
## [1] "data"
## [1] "iTwiner"
## [1] 66
## [1] 44
## [1] 3079
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 67
## [1] 44
## [1] 3090
## [1] "data"
## [1] "iTwiner"
## [1] 68
## [1] 45
## [1] 3101
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 69
## [1] 45
## [1] 3112
## [1] "data"
## [1] "iTwiner"
## [1] 70
## [1] 46
## [1] 3123
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 71
## [1] 47
## [1] 3134
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 72
## [1] 47
## [1] 3145
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 73
## [1] 48
## [1] 3156
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 74
## [1] 49
## [1] 3167
## [1] "data"
## [1] "iTwiner"
## [1] 75
## [1] 50
## [1] 3178
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 76
## [1] 50
## [1] 3189
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 77
## [1] 50
## [1] 3200
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 78
## [1] 50
## [1] 3211
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 79
## [1] 51
## [1] 3222
## [1] "data"
## [1] "iTwiner"
## [1] 80
## [1] 52
## [1] 3233
## [1] "data"
## [1] "iTwiner"
## [1] 81
## [1] 53
## [1] 3244
## [1] "data"
## [1] "iTwiner"
## [1] 82
## [1] 54
## [1] 3255
## [1] "data"
## [1] "iTwiner"
## [1] 83
## [1] 55
## [1] 3266
## [1] "data"
## [1] "iTwiner"
## [1] 84
## [1] 56
## [1] 3277
## [1] "data"
## [1] "iTwiner"
## [1] 85
## [1] 57
## [1] 3288
## [1] "data"
## [1] "iTwiner"
## [1] 86
## [1] 58
## [1] 3299
## [1] "data"
## [1] "iTwiner"
## [1] 87
## [1] 59
## [1] 3310
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 88
## [1] 59
## [1] 3321
## [1] "data"
## [1] "iTwiner"
## [1] 89
## [1] 60
## [1] 3332
## [1] "data"
## [1] "iTwiner"
## [1] 90
## [1] 61
## [1] 3343
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 91
## [1] 62
## [1] 3354
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 92
## [1] 63
## [1] 3365
## [1] "data"
## [1] "iTwiner"
## [1] 93
## [1] 64
## [1] 3376
## [1] "data"
## [1] "iTwiner"
## [1] 94
## [1] 65
## [1] 3387
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 95
## [1] 66
## [1] 3398
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 96
## [1] 67
## [1] 3409
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 97
## [1] 67
## [1] 3420
## [1] "data"
## [1] "iTwiner"
## [1] 98
## [1] 68
## [1] 3431
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 99
## [1] 69
## [1] 3442
## [1] "data"
## [1] "iTwiner"
## [1] 100
## [1] 70
## [1] 3453
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 101
## [1] 70
## [1] 3464
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 102
## [1] 70
## [1] 3475
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 103
## [1] 71
## [1] 3486
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 104
## [1] 72
## [1] 3497
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 105
## [1] 73
## [1] 3508
## [1] "data"
## [1] "iTwiner"
## [1] 106
## [1] 74
## [1] 3519
## [1] "data"
## [1] "iTwiner"
## [1] 107
## [1] 75
## [1] 3530
## [1] "data"
## [1] "iTwiner"
## [1] 108
## [1] 76
## [1] 3541
## [1] "data"
## [1] "iTwiner"
## [1] 109
## [1] 77
## [1] 3552
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 110
## [1] 78
## [1] 3563
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 111
## [1] 79
## [1] 3574
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 112
## [1] 80
## [1] 3585
## [1] "data"
## [1] "iTwiner"
## [1] 113
## [1] 81
## [1] 3596
## [1] "data"
## [1] "iTwiner"
## [1] 114
## [1] 82
## [1] 3607
## [1] "data"
## [1] "iTwiner"
## [1] 115
## [1] 83
## [1] 3618
## [1] "data"
## [1] "iTwiner"
## [1] 116
## [1] 84
## [1] 3629
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 117
## [1] 85
## [1] 3640
## [1] "data"
## [1] "iTwiner"
## [1] 118
## [1] 86
## [1] 3651
## [1] "data"
## [1] "iTwiner"
## [1] 119
## [1] 87
## [1] 3662
## [1] "data"
## [1] "iTwiner"
## [1] 120
## [1] 88
## [1] 3673
## [1] "data"
## [1] "iTwiner"
## [1] 121
## [1] 89
## [1] 3684
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 122
## [1] 89
## [1] 3695
## [1] "data"
## [1] "iTwiner train ups"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 123
## [1] 89
## [1] 3706
## [1] "data"
## [1] "iTwiner"
## [1] 124
## [1] 90
## [1] 3717
## [1] "data"
## [1] "iTwiner"
## [1] 125
## [1] 91
## [1] 3728
## [1] "data"
## [1] "iTwiner"
## [1] 126
## [1] 92
## [1] 3739
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 127
## [1] 93
## [1] 3750
## [1] "data"
## [1] "iTwiner"
## [1] 128
## [1] 94
## [1] 3761
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 129
## [1] 95
## [1] 3772
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 130
## [1] 96
## [1] 3783
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 131
## [1] 97
## [1] 3794
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 132
## [1] 98
## [1] 3805
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
## [1] 133
## [1] 99
## [1] 3816
## [1] "data"
## [1] "iTwiner test ups"
## [1] "iTwiner"
3.3.5.2.1 Measures of model performance
# Indices of the runs to summarise: a run is kept for a model when its stored
# accuracy is non-zero. Row 1 of the accuracy matrices is read for EN and row 2
# for iTwiner; training and test runs are selected independently of each other.
# NOTE(review): presumably 0 is the initial fill value of the result matrices,
# so a zero marks a run that produced no model -- confirm against the loop.

# EN
runs_en_tr <- which(acc_cox_tr[1, ] != 0)
runs_en_ts <- which(acc_cox_tes[1, ] != 0)

# iTwiner
runs_itw_tr <- which(acc_cox_tr[2, ] != 0)
runs_itw_ts <- which(acc_cox_tes[2, ] != 0)
  • Number of variables selected
# Number of variables selected per run (mean, median and standard deviation),
# restricted to the runs listed in runs_en_tr / runs_itw_tr.
# EN (row 1)
nvar_en <- nvar_selected_1[1, runs_en_tr]
mean(nvar_en)
## [1] 41.43
median(nvar_en)
## [1] 39
sd(nvar_en)
## [1] 19.76031
# iTwiner (row 2)
nvar_tw <- nvar_selected_1[2, runs_itw_tr]
mean(nvar_tw)
## [1] 44.56
median(nvar_tw)
## [1] 38.5
sd(nvar_tw)
## [1] 20.6546
  • Accuracy
# Accuracy per run (mean, median, standard deviation). Row 1 = EN,
# row 2 = iTwiner; only the runs kept in the runs_* index vectors are used.

# EN, training set
acc_cox_tr_EN <- acc_cox_tr[1, runs_en_tr]
# hist(acc_cox_tr_EN)
mean(acc_cox_tr_EN)
## [1] 0.8010526
median(acc_cox_tr_EN)
## [1] 0.7894737
sd(acc_cox_tr_EN)
## [1] 0.0722694
# EN, test set
acc_cox_tes_EN <- acc_cox_tes[1, runs_en_ts]
# hist(acc_cox_tes_EN)
mean(acc_cox_tes_EN)
## [1] 0.5744681
median(acc_cox_tes_EN)
## [1] 0.5882353
sd(acc_cox_tes_EN)
## [1] 0.07391765
# iTwiner, training set
acc_cox_tr_iTwiner <- acc_cox_tr[2, runs_itw_tr]
# hist(acc_cox_tr_iTwiner)
mean(acc_cox_tr_iTwiner)
## [1] 0.705
median(acc_cox_tr_iTwiner)
## [1] 0.7105263
sd(acc_cox_tr_iTwiner)
## [1] 0.06588722
# iTwiner, test set
acc_cox_tes_iTwiner <- acc_cox_tes[2, runs_itw_ts]
# hist(acc_cox_tes_iTwiner)
mean(acc_cox_tes_iTwiner)
## [1] 0.6263463
median(acc_cox_tes_iTwiner)
## [1] 0.6470588
sd(acc_cox_tes_iTwiner)
## [1] 0.04981289
  • Misclassification
# Misclassification values per run (mean, median, standard deviation).
# Row 1 = EN, row 2 = iTwiner; only the runs in the runs_* vectors are used.
# NOTE(review): the magnitudes suggest counts of misclassified samples rather
# than rates -- confirm against the code that fills miscl_cox_tr / miscl_cox_tes.

# EN, training set
miscl_cox_tr_EN <- miscl_cox_tr[1, runs_en_tr]
mean(miscl_cox_tr_EN)
## [1] 7.56
median(miscl_cox_tr_EN)
## [1] 8
sd(miscl_cox_tr_EN)
## [1] 2.746237
# EN, test set
miscl_cox_tes_EN <- miscl_cox_tes[1, runs_en_ts]
mean(miscl_cox_tes_EN)
## [1] 7.234043
median(miscl_cox_tes_EN)
## [1] 7
sd(miscl_cox_tes_EN)
## [1] 1.2566
# iTwiner, training set
miscl_cox_tr_iTwiner <- miscl_cox_tr[2, runs_itw_tr]
mean(miscl_cox_tr_iTwiner)
## [1] 11.21
median(miscl_cox_tr_iTwiner)
## [1] 11
sd(miscl_cox_tr_iTwiner)
## [1] 2.503714
# iTwiner, test set
miscl_cox_tes_iTwiner <- miscl_cox_tes[2, runs_itw_ts]
mean(miscl_cox_tes_iTwiner)
## [1] 6.352113
median(miscl_cox_tes_iTwiner)
## [1] 6
sd(miscl_cox_tes_iTwiner)
## [1] 0.8468191
  • False Neg
# False negatives per run (mean, median, standard deviation), taken from
# fneg_reg_tr (training) and fneg_reg_ts (test) for the runs kept in the
# runs_* index vectors.
# EN (row 1): training set, then test set
fneg_reg_en_train <- fneg_reg_tr[1,runs_en_tr]
mean(fneg_reg_en_train)
## [1] 7.56
median(fneg_reg_en_train)
## [1] 8
sd(fneg_reg_en_train)
## [1] 2.746237
fneg_reg_en_test <- fneg_reg_ts[1,runs_en_ts]
mean(fneg_reg_en_test)
## [1] 5.787234
median(fneg_reg_en_test)
## [1] 6
sd(fneg_reg_en_test)
## [1] 1.134715
# iTwiner (row 2): training set, then test set
fneg_reg_iTwiner_train <- fneg_reg_tr[2,runs_itw_tr]
mean(fneg_reg_iTwiner_train)
## [1] 11.21
median(fneg_reg_iTwiner_train)
## [1] 11
sd(fneg_reg_iTwiner_train)
## [1] 2.503714
fneg_reg_iTwiner_test <- fneg_reg_ts[2,runs_itw_ts]
mean(fneg_reg_iTwiner_test)
## [1] 6.239437
median(fneg_reg_iTwiner_test)
## [1] 6
sd(fneg_reg_iTwiner_test)
## [1] 0.7831218
  • sensitivity
# Sensitivity per run (mean, median, standard deviation). Row 1 = EN,
# row 2 = iTwiner; only the runs kept in the runs_* index vectors are used.

# EN, training set
sensitivity_cox_tr_EN <- sensitivity_cox_tr[1, runs_en_tr]
mean(sensitivity_cox_tr_EN)
## [1] 0.5552941
median(sensitivity_cox_tr_EN)
## [1] 0.5294118
sd(sensitivity_cox_tr_EN)
## [1] 0.1615434
# EN, test set
sensitivity_cox_tes_EN <- sensitivity_cox_tes[1, runs_en_ts]
mean(sensitivity_cox_tes_EN)
## [1] 0.2765957
median(sensitivity_cox_tes_EN)
## [1] 0.25
sd(sensitivity_cox_tes_EN)
## [1] 0.1418393
# iTwiner, training set
sensitivity_cox_tr_iTwiner <- sensitivity_cox_tr[2, runs_itw_tr]
mean(sensitivity_cox_tr_iTwiner)
## [1] 0.3405882
median(sensitivity_cox_tr_iTwiner)
## [1] 0.3529412
sd(sensitivity_cox_tr_iTwiner)
## [1] 0.1472773
# iTwiner, test set
sensitivity_cox_tes_iTwiner <- sensitivity_cox_tes[2, runs_itw_ts]
mean(sensitivity_cox_tes_iTwiner)
## [1] 0.2200704
median(sensitivity_cox_tes_iTwiner)
## [1] 0.25
sd(sensitivity_cox_tes_iTwiner)
## [1] 0.09789022
  • specificity
# Specificity per run (mean, median, standard deviation). Row 1 = EN,
# row 2 = iTwiner; only the runs kept in the runs_* index vectors are used.

# EN, training set
specificity_cox_tr_EN <- specificity_cox_tr[1, runs_en_tr]
mean(specificity_cox_tr_EN)
## [1] 1
median(specificity_cox_tr_EN)
## [1] 1
sd(specificity_cox_tr_EN)
## [1] 0
# EN, test set
specificity_cox_tes_EN <- specificity_cox_tes[1, runs_en_ts]
mean(specificity_cox_tes_EN)
## [1] 0.8392435
median(specificity_cox_tes_EN)
## [1] 0.8888889
sd(specificity_cox_tes_EN)
## [1] 0.1181936
# iTwiner, training set
specificity_cox_tr_iTwiner <- specificity_cox_tr[2, runs_itw_tr]
mean(specificity_cox_tr_iTwiner)
## [1] 1
median(specificity_cox_tr_iTwiner)
## [1] 1
sd(specificity_cox_tr_iTwiner)
## [1] 0
# iTwiner, test set
specificity_cox_tes_iTwiner <- specificity_cox_tes[2, runs_itw_ts]
mean(specificity_cox_tes_iTwiner)
## [1] 0.9874804
median(specificity_cox_tes_iTwiner)
## [1] 1
sd(specificity_cox_tes_iTwiner)
## [1] 0.04005858
  • AUC
# AUC per run (mean, median, standard deviation). Row 1 = EN,
# row 2 = iTwiner; only the runs kept in the runs_* index vectors are used.

# EN, training set
auc_cox_tr_EN <- auc_cox_tr[1, runs_en_tr]
mean(auc_cox_tr_EN)
## [1] 0.7776471
median(auc_cox_tr_EN)
## [1] 0.7647059
sd(auc_cox_tr_EN)
## [1] 0.08077168
# EN, test set
auc_cox_tes_EN <- auc_cox_tes[1, runs_en_ts]
mean(auc_cox_tes_EN)
## [1] 0.5611702
median(auc_cox_tes_EN)
## [1] 0.5694444
sd(auc_cox_tes_EN)
## [1] 0.07220669
# iTwiner, training set
auc_cox_tr_iTwiner <- auc_cox_tr[2, runs_itw_tr]
mean(auc_cox_tr_iTwiner)
## [1] 0.6702941
median(auc_cox_tr_iTwiner)
## [1] 0.6764706
sd(auc_cox_tr_iTwiner)
## [1] 0.07363866
# iTwiner, test set
auc_cox_tes_iTwiner <- auc_cox_tes[2, runs_itw_ts]
mean(auc_cox_tes_iTwiner)
## [1] 0.6037754
median(auc_cox_tes_iTwiner)
## [1] 0.625
sd(auc_cox_tes_iTwiner)
## [1] 0.05201897
  • Names of genes selected

Variables always selected

# Variables selected in every retained run: intersect the per-run selections.
# The runs are those indexed by runs_en_ts / runs_itw_ts (the test-set index
# vectors).

# EN
var_selected_alw_select_en <- var_selected_en1[runs_en_ts]
var_selected_alw_select_en1 <- Reduce(f = intersect, x = var_selected_alw_select_en)
print(paste(
  "variables always selected by EN = ",
  length(var_selected_alw_select_en1)
))
## [1] "variables always selected by EN =  0"
# iTwiner
var_selected_alw_select_iTwiner <- var_selected_iTwiner1[runs_itw_ts]
var_selected_alw_select_iTwiner1 <- Reduce(f = intersect, x = var_selected_alw_select_iTwiner)
print(paste(
  "variables always selected by iTwiner = ",
  length(var_selected_alw_select_iTwiner1)
))
## [1] "variables always selected by iTwiner =  2"
## [1] "variables always selected by iTwiner =  2"

Variables selected in more than 50% of the bootstrap samples

# Variables selected in more than half of the retained runs.
# l is the number of retained runs; Freq counts in how many of them a variable
# was selected.

# EN
l <- length(var_selected_alw_select_en)
var_selected_50_select_en <- as.data.frame(table(unlist(var_selected_alw_select_en)))
var_selected_50_select_en <- var_selected_50_select_en[
  var_selected_50_select_en$Freq > 0.50 * l,
]
print(paste("variables selected 50% by EN = ", nrow(var_selected_50_select_en)))
## [1] "variables selected 50% by EN =  6"
var_selected_50_select_en$Var1
## [1] IGBP1P2    LRRC37A14P MIR5002    MIR6829    MIR8078    NMU       
## 1014 Levels: ABCF2P2 ABHD16B ABHD17AP6 ACRV1 ACSL6-AS1 ACTBP9 ... ZNHIT1P1
# iTwiner
l <- length(var_selected_alw_select_iTwiner)
var_selected_50_select_iTwiner <- as.data.frame(table(unlist(var_selected_alw_select_iTwiner)))
var_selected_50_select_iTwiner <- var_selected_50_select_iTwiner[
  var_selected_50_select_iTwiner$Freq > 0.50 * l,
]
print(paste("variables selected 50% by iTwiner = ", nrow(var_selected_50_select_iTwiner)))
## [1] "variables selected 50% by iTwiner =  30"
var_selected_50_select_iTwiner$Var1
##  [1] APOOP4      DNTT        EEF1B2P6    H2BC2P      HORMAD2     HSPE1P4    
##  [7] IGHVIII-2-1 IQCF5-AS1   KCTD9P3     KDM4F       LINC01100   LINC02059  
## [13] MIR602      MIR659      MIR8078     NDUFA5P10   NME2P2      NMNAT1P3   
## [19] OPCML-IT2   OR1S2       RAC1P3      RNU6-973P   RPL21P104   RPL23AP26  
## [25] RPL7P12     TDGF1P7     TLR12P      TRAJ49      TRBV11-1    VN2R9P     
## 308 Levels: ABCD1P3 ALOX15P2 ANKRD20A10P AOX3P-AOX2P APOOP4 ... ZDHHC20P2
  • Select the 100 and the 50 most frequent variables
# Rank every variable by the number of runs in which it was selected, plot the
# distribution of those counts, and keep the most frequent ones:
#   top100_* - data frame with the 100 most frequent variables (written to xlsx)
#   top50_*  - factor with the 50 most frequent variables
#
# The full frequency tables are stored under their own names (freq_en,
# freq_itw). They must not be assigned to var_selected_50_select_en /
# var_selected_50_select_iTwiner: that would replace the ">50% of runs" subsets
# with the complete tables, and the "variables in common" step that follows
# would then intersect every variable ever selected instead of the frequent
# ones.
# head() is used instead of [1:100, ] / [1:50] so that a table with fewer rows
# is not padded with NA entries.

# EN
freq_en <- as.data.frame(table(unlist(var_selected_alw_select_en)))
freq_en <- freq_en[order(freq_en$Freq, decreasing = TRUE), ]
hist(freq_en$Freq)

top100_en <- head(freq_en, 100)$Var1
top50_en <- head(top100_en, 50)
top100_en <- as.data.frame(top100_en)
write_xlsx(top100_en, "List_top100_en_d3.xlsx")


# iTwiner
freq_itw <- as.data.frame(table(unlist(var_selected_alw_select_iTwiner)))
freq_itw <- freq_itw[order(freq_itw$Freq, decreasing = TRUE), ]
hist(freq_itw$Freq)

top100_itw <- head(freq_itw, 100)$Var1
top50_itw <- head(top100_itw, 50)
top100_itw <- as.data.frame(top100_itw)
write_xlsx(top100_itw, "List_top100_itw_d3.xlsx")

variables in common between EN and iTwiner

# Variables listed in both var_selected_50_select_iTwiner and
# var_selected_50_select_en (as those tables stand at this point), returned in
# the order of the iTwiner table.
common_var_selected_50_en_iTwiner <- var_selected_50_select_iTwiner$Var1[
  var_selected_50_select_iTwiner$Var1 %in% var_selected_50_select_en$Var1
]
length(common_var_selected_50_en_iTwiner)
## [1] 95
common_var_selected_50_en_iTwiner
##  [1] NME2P2       RAC1P3       HORMAD2      OPCML-IT2    LINC01100   
##  [6] NMNAT1P3     MIR602       KDM4F        TRBV11-1     NDUFA5P10   
## [11] APOOP4       IQCF5-AS1    MIR8078      MIR659       H2BC2P      
## [16] RPL7P12      LINC02059    EEF1B2P6     RNU6-973P    TDGF1P7     
## [21] RPL23AP26    HSPE1P4      PPP1R14BP4   XRCC6P2      DYTN        
## [26] MIR8052      RNU6-179P    IGHV3-16     LINC02178    COX5AP1     
## [31] DMD-AS3      MIR6729      RNU7-170P    MIR323B      SNORD42A    
## [36] RNU6-1105P   RNU6-73P     RNU6-902P    SEPTIN2P1    LINC01927   
## [41] MIR376B      MIR609       RNU6-38P     RNU7-193P    MIR5702     
## [46] VWC2L-IT1    HLA-S        RPS6P15      BNIP3P36     MTATP8P1    
## [51] RNU6-851P    MIR4783      RBMY2XP      RNA5SP246    EPPIN-WFDC6 
## [56] MIR4667      RPS27AP20    SNORD31B     MIR7156      RNU6-954P   
## [61] TRAV11       DNAJA1P6     DUTP5        IGHJ1        MIR1243     
## [66] MIR4493      MIR6782      RNA5SP205    RNU4-75P     RNU6-154P   
## [71] RNU6-521P    RPS4XP11     SELENOTP1    SULT6B2P     TDGF1P1     
## [76] ANKRD20A10P  GAPDHP36     IGHV3OR16-13 LINC02726    MIR4317     
## [81] MIR510       MIR587       MIR6083      RNA5SP74     RNU4-13P    
## [86] RNU4-88P     RNU6-1031P   RNU6-1050P   RNU6-115P    RNU6-1293P  
## [91] RNU6-156P    RNU6-603P    RNU7-154P    RPL21P108    RPL5P21     
## 308 Levels: ABCD1P3 ALOX15P2 ANKRD20A10P AOX3P-AOX2P APOOP4 ... ZDHHC20P2
  • Variables selected by EN and iTwiner that are DEGs
# Build the count matrix and the sample annotation for the edgeR analysis of
# the genes selected by EN and iTwiner.

# samples in rows, ordered by sample ID
xdata <- rnaseq1[order(row.names(rnaseq1)), ]

# Union of the two top-50 lists. unique() matters: a gene present in both lists
# would otherwise be extracted twice (the copy gets a ".1" suffix), tested
# twice, and counted twice in the DEG totals and in the FDR adjustment.
nomesgenes <- unique(c(as.vector(top50_en), as.vector(top50_itw)))
xdata <- xdata[, nomesgenes]

# sample annotation, keyed and ordered by sample ID
rownames(DATASET1_bal) <- DATASET1_bal$ID
ydata <- as.data.frame(DATASET1_bal[order(row.names(DATASET1_bal)), ])
rownames(ydata) <- ydata$ID

# keep only the samples present in both tables
xdata <- xdata[rownames(xdata) %in% rownames(ydata), ]
ydata <- as.data.frame(ydata[rownames(ydata) %in% rownames(xdata), ])

# keep features with standard deviation > 0
# (vapply fixes the result type; drop = FALSE keeps a table even if a single
# gene remains)
xdata <- xdata[
  ,
  vapply(seq_len(ncol(xdata)), function(ix) sd(xdata[, ix]), numeric(1)) != 0,
  drop = FALSE
]
# edgeR expects genes in rows and samples in columns
xdata <- t(xdata)

group <- as.factor(ydata$class)
class <- as.data.frame(ydata$class)
# NOTE(review): DGEList() derives the library sizes from the column sums of the
# counts it is given; here that is only the selected genes, not the whole
# library -- confirm this is intended, or pass lib.size explicitly.
edgeR.DGElist <- DGEList(counts = xdata, group = group)

# design matrix: intercept plus the group effect
design <- model.matrix(~ group)

# drop genes that do not reach 1 count per million in at least 5 samples
keep <- rowSums(cpm(edgeR.DGElist) >= 1) >= 5
edgeR.DGElist <- edgeR.DGElist[keep, ]

# dispersion estimates for the retained genes
edgeR.DGElist <- estimateDisp(edgeR.DGElist, design = design)

# negative binomial GLM for every gene
edger_fit <- glmFit(edgeR.DGElist, design = design)

# likelihood ratio test on the last coefficient of the design, which is the
# coefficient glmLRT tests when none is given
edger_lrt <- glmLRT(edger_fit, coef = ncol(design))
summary(decideTests(edger_lrt))
##        groupPm
## Down        18
## NotSig      67
## Up          15
# Full result table of the likelihood ratio test, sorted by p-value, with
# Benjamini-Hochberg adjusted p-values (FDR column)
DGE.results_edgeR <- topTags(
  edger_lrt,
  n = Inf,
  adjust.method = "BH",
  sort.by = "PValue"
)

# show the 10 top-ranked genes
topTags(DGE.results_edgeR)
## Coefficient:  groupPm 
##               logFC    logCPM       LR       PValue          FDR
## MIR5002    3.743050 11.669347 28.15427 1.120205e-07 1.120205e-05
## MIR4725    2.745083 11.881621 25.33740 4.812863e-07 2.406432e-05
## RNU6-33P   3.157497 13.900931 20.12542 7.252630e-06 2.417543e-04
## OR4K12P    2.690314  9.653652 18.21816 1.969917e-05 4.613801e-04
## MIR8078    2.512305  9.874790 17.57065 2.768280e-05 4.613801e-04
## MIR8078.1  2.512305  9.874790 17.57065 2.768280e-05 4.613801e-04
## MIR5579    2.488928 10.277140 14.68284 1.271988e-04 1.817126e-03
## RNU6-1263P 2.271580  9.975633 13.53384 2.342998e-04 2.581740e-03
## MIR8052    2.048224  9.583558 13.39440 2.523762e-04 2.581740e-03
## HSPE1P4    2.134876  9.581129 13.35181 2.581740e-04 2.581740e-03
# Genes called differentially expressed: FDR below 0.05
genes_deg <- subset(DGE.results_edgeR$table, FDR < 0.05)
# number of differentially expressed genes (rows) and result columns
dim(genes_deg)
## [1] 33  5
# write_xlsx does not export row names, so store the gene symbols in a column
genes_deg$row <- rownames(genes_deg)
write_xlsx(genes_deg, "genes_deg_enitw_d3.xlsx")

# DEGs with a positive log fold change for the tested coefficient
# (up-regulated relative to the reference class)
high <- genes_deg[with(genes_deg, logFC > 0), ]
dim(high)
## [1] 15  6
# DEGs with a negative log fold change (down-regulated relative to the
# reference class)
low <- genes_deg[with(genes_deg, logFC < 0), ]
dim(low)
## [1] 18  6

3.3.5.3 3) Classification based on genes selected by regularized logistic regression

3.3.5.3.1 EN + Classifiers
# Assemble the inputs for the classifiers trained on the EN-selected genes.

# expression table with samples in rows; filtered and ordered below
xdata <- xdataT


# class labels keyed by sample ID
# NOTE(review): assumes clinic1 and DATASET1 list the samples in the same
# order -- confirm.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

#ydata.raw$class <- c(rep(0,28),rep(1,34))

# recode the labels: "P" -> 0, "Pm" -> 1
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# keep the samples present in both tables ...
xdata <- xdata[rownames(xdata) %in% rownames(ydata.raw), ]
ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in% rownames(xdata), ])

# ... and sort both by sample ID so that their rows line up
xdata <- xdata[order(row.names(xdata)), ]
ydata.raw <- ydata.raw[order(row.names(ydata.raw)), ]



ydata <- as.data.frame(ydata.raw[, 1:2])
colnames(ydata) <- c("class", "id")

# expression of the genes most frequently selected by EN
# (drop = FALSE keeps a table even if only one gene is requested)
names <- as.vector(top50_en)
xdata_en <- as.data.frame(xdata[, names, drop = FALSE])

# remember the gene symbols, then switch to generic column names Var1, Var2, ...
# The names are generated from the actual number of columns instead of a fixed
# 1:50, so the step also works when fewer (or more) genes are supplied.
nomesgenes <- colnames(xdata_en)
colnames(xdata_en) <- paste0("Var", seq_len(ncol(xdata_en)))
xdata_en$type <- as.factor(ydata$class)
ydata$class <- as.numeric(ydata$class)
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.3.5.3.1.1 Measures of model performance train
  • Accuracy
# Training-set accuracy of the classifiers built on the EN-selected genes.
# Each row of acc_enplus_train is read into one variable and summarised by
# mean, median and standard deviation over its entries.
# NOTE(review): judging by the variable names, rows 1-5 are decision tree,
# SVM, a second SVM variant (svmR), logistic regression and random forest --
# confirm against the model-fitting loop.
# row 1
acc_trees <- acc_enplus_train[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.9934211
median(acc_trees)
## [1] 1
sd(acc_trees)
## [1] 0.01261507
# row 2
acc_svm <- acc_enplus_train[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.9968421
median(acc_svm)
## [1] 1
sd(acc_svm)
## [1] 0.008594701
# row 3
acc_svmR <- acc_enplus_train[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.9505263
median(acc_svmR)
## [1] 0.9736842
sd(acc_svmR)
## [1] 0.05940941
# row 4
acc_logs <- acc_enplus_train[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 1
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0
# row 5
acc_rf <- acc_enplus_train[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 1
median(acc_rf)
## [1] 1
sd(acc_rf)
## [1] 0
  • AUC
# Training-set AUC of the classifiers built on the EN-selected genes.
# Each row of auc_enplus_train is read into one variable and summarised by
# mean, median and standard deviation over its entries.
# row 1
auc_trees <- auc_enplus_train[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.9933754
median(auc_trees)
## [1] 1
sd(auc_trees)
## [1] 0.01277733
# row 2
auc_svm <- auc_enplus_train[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.9969188
median(auc_svm)
## [1] 1
sd(auc_svm)
## [1] 0.008436307
# row 3
auc_svmR <- auc_enplus_train[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.9448179
median(auc_svmR)
## [1] 0.9705882
sd(auc_svmR)
## [1] 0.06639741
# row 4
auc_logs <- auc_enplus_train[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 1
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0
# row 5
auc_rf <- auc_enplus_train[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 1
median(auc_rf)
## [1] 1
sd(auc_rf)
## [1] 0
  • Misclassification
# Training-set misclassification values of the classifiers built on the
# EN-selected genes. Each row of miscl_enplus_train is read into one variable
# and summarised by mean, median and standard deviation over its entries.
# NOTE(review): the values look like counts of misclassified samples rather
# than rates -- confirm against the code that fills the matrix.
# row 1
miscl_trees <- miscl_enplus_train[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 0.25
median(miscl_trees)
## [1] 0
sd(miscl_trees)
## [1] 0.4793725
# row 2
miscl_svm <- miscl_enplus_train[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 0.12
median(miscl_svm)
## [1] 0
sd(miscl_svm)
## [1] 0.3265986
# row 3
miscl_svmR <- miscl_enplus_train[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 1.88
median(miscl_svmR)
## [1] 1
sd(miscl_svmR)
## [1] 2.257557
# row 4
miscl_logs <- miscl_enplus_train[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 0
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0
# row 5
miscl_rf <- miscl_enplus_train[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 0
median(miscl_rf)
## [1] 0
sd(miscl_rf)
## [1] 0
  • Sensitivity
# Training-set sensitivity of the classifiers built on the EN-selected genes.
# Each row of sensitivity_enplus_train is read into one variable and
# summarised by mean, median and standard deviation over its entries.
# row 1
sensitivity_trees <- sensitivity_enplus_train[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.9929412
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.01921168
# row 2
sensitivity_svm <- sensitivity_enplus_train[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.9976471
median(sensitivity_svm)
## [1] 1
sd(sensitivity_svm)
## [1] 0.01158508
# row 3
sensitivity_svmR <- sensitivity_enplus_train[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.8905882
median(sensitivity_svmR)
## [1] 0.9411765
sd(sensitivity_svmR)
## [1] 0.1329921
# row 4
sensitivity_logs <- sensitivity_enplus_train[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 1
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0
# row 5
sensitivity_rf <- sensitivity_enplus_train[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 1
median(sensitivity_rf)
## [1] 1
sd(sensitivity_rf)
## [1] 0
  • Specificity
# Training-set specificity of the classifiers built on the EN-selected genes.
# Each row of specificity_enplus_train is read into one variable and
# summarised by mean, median and standard deviation over its entries.
# row 1
specificity_trees <- specificity_enplus_train[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.9938095
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.01609513
# row 2
specificity_svm <- specificity_enplus_train[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.9961905
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.01298381
# row 3
specificity_svmR <- specificity_enplus_train[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.9990476
median(specificity_svmR)
## [1] 1
sd(specificity_svmR)
## [1] 0.006700252
# row 4
specificity_logs <- specificity_enplus_train[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 1
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0
# row 5
specificity_rf <- specificity_enplus_train[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# Training-set false negatives of the classifiers built on the EN-selected
# genes. Each row of fneg_enplus_train is read into one variable and
# summarised by mean, median and standard deviation over its entries.
# row 1
fneg_trees <- fneg_enplus_train[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 0.12
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.3265986
# row 2
fneg_svm <- fneg_enplus_train[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 0.04
median(fneg_svm)
## [1] 0
sd(fneg_svm)
## [1] 0.1969464
# row 3
fneg_svmR <- fneg_enplus_train[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 1.86
median(fneg_svmR)
## [1] 1
sd(fneg_svmR)
## [1] 2.260866
# row 4
fneg_logs <- fneg_enplus_train[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 0
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0
# row 5
fneg_rf <- fneg_enplus_train[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 0
median(fneg_rf)
## [1] 0
sd(fneg_rf)
## [1] 0
3.3.5.3.1.2 Measures of model performance test
  • Accuracy
# Test-set accuracy of the classifiers built on the EN-selected genes.
# Each row of acc_enplus is read into one variable and summarised by mean,
# median and standard deviation over its entries.
# The acc_* names are the ones used for the training summary, so from here on
# they hold the test-set values.
# row 1
acc_trees <- acc_enplus[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.6511765
median(acc_trees)
## [1] 0.6470588
sd(acc_trees)
## [1] 0.1189045
# row 2
acc_svm <- acc_enplus[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.7264706
median(acc_svm)
## [1] 0.7058824
sd(acc_svm)
## [1] 0.09286714
# row 3
acc_svmR <- acc_enplus[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.7370588
median(acc_svmR)
## [1] 0.7647059
sd(acc_svmR)
## [1] 0.08707095
# row 4
acc_logs <- acc_enplus[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.7570588
median(acc_logs)
## [1] 0.7647059
sd(acc_logs)
## [1] 0.09030258
# row 5
acc_rf <- acc_enplus[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.7782353
median(acc_rf)
## [1] 0.7647059
sd(acc_rf)
## [1] 0.0814289
  • AUC
# Test-set AUC of the classifiers built on the EN-selected genes.
# Each row of auc_enplus is read into one variable and summarised by mean,
# median and standard deviation over its entries.
# The auc_* names are the ones used for the training summary, so from here on
# they hold the test-set values.
# row 1
auc_trees <- auc_enplus[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.6635417
median(auc_trees)
## [1] 0.6527778
sd(auc_trees)
## [1] 0.09839943
# row 2
auc_svm <- auc_enplus[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.7247222
median(auc_svm)
## [1] 0.7118056
sd(auc_svm)
## [1] 0.09112109
# row 3
auc_svmR <- auc_enplus[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.7351389
median(auc_svmR)
## [1] 0.7569444
sd(auc_svmR)
## [1] 0.08763573
# row 4
auc_logs <- auc_enplus[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.7547222
median(auc_logs)
## [1] 0.7569444
sd(auc_logs)
## [1] 0.09145386
# row 5
auc_rf <- auc_enplus[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.769375
median(auc_rf)
## [1] 0.7569444
sd(auc_rf)
## [1] 0.08473783
  • Misclassification
# Test-set misclassification values of the classifiers built on the
# EN-selected genes. Each row of miscl_enplus is read into one variable and
# summarised by mean, median and standard deviation over its entries.
# The miscl_* names are the ones used for the training summary, so from here
# on they hold the test-set values.
# row 1
miscl_trees <- miscl_enplus[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 5.93
median(miscl_trees)
## [1] 6
sd(miscl_trees)
## [1] 2.021376
# row 2
miscl_svm <- miscl_enplus[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 4.65
median(miscl_svm)
## [1] 5
sd(miscl_svm)
## [1] 1.578741
# row 3
miscl_svmR <- miscl_enplus[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 4.47
median(miscl_svmR)
## [1] 4
sd(miscl_svmR)
## [1] 1.480206
# row 4
miscl_logs <- miscl_enplus[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 4.13
median(miscl_logs)
## [1] 4
sd(miscl_logs)
## [1] 1.535144
# row 5
miscl_rf <- miscl_enplus[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 3.77
median(miscl_rf)
## [1] 4
sd(miscl_rf)
## [1] 1.384291
  • Sensitivity
# Test-set sensitivity of the classifiers built on the EN-selected genes.
# Each row of sensitivity_enplus is read into one variable and summarised by
# mean, median and standard deviation over its entries.
# The sensitivity_* names are the ones used for the training summary, so from
# here on they hold the test-set values.
# row 1
sensitivity_trees <- sensitivity_enplus[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.62125
median(sensitivity_trees)
## [1] 0.625
sd(sensitivity_trees)
## [1] 0.1753919
# row 2
sensitivity_svm <- sensitivity_enplus[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.665
median(sensitivity_svm)
## [1] 0.625
sd(sensitivity_svm)
## [1] 0.1828161
# row 3
sensitivity_svmR <- sensitivity_enplus[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.7025
median(sensitivity_svmR)
## [1] 0.75
sd(sensitivity_svmR)
## [1] 0.1711333
# row 4
sensitivity_logs <- sensitivity_enplus[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.715
median(sensitivity_logs)
## [1] 0.75
sd(sensitivity_logs)
## [1] 0.1629123
# row 5
sensitivity_rf <- sensitivity_enplus[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.61875
median(sensitivity_rf)
## [1] 0.625
sd(sensitivity_rf)
## [1] 0.1689007
  • Specificity
# Test-set specificity of the classifiers built on the EN-selected genes.
# Each row of specificity_enplus is read into one variable and summarised by
# mean, median and standard deviation over its entries.
# The specificity_* names are the ones used for the training summary, so from
# here on they hold the test-set values.
# row 1
specificity_trees <- specificity_enplus[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.6777778
median(specificity_trees)
## [1] 0.6666667
sd(specificity_trees)
## [1] 0.1719151
# row 2
specificity_svm <- specificity_enplus[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.7811111
median(specificity_svm)
## [1] 0.7777778
sd(specificity_svm)
## [1] 0.1297066
# row 3
specificity_svmR <- specificity_enplus[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.7677778
median(specificity_svmR)
## [1] 0.7777778
sd(specificity_svmR)
## [1] 0.1540046
# row 4
specificity_logs <- specificity_enplus[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.7944444
median(specificity_logs)
## [1] 0.7777778
sd(specificity_logs)
## [1] 0.1334222
# row 5
specificity_rf <- specificity_enplus[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 0.92
median(specificity_rf)
## [1] 0.8888889
sd(specificity_rf)
## [1] 0.09219443
  • False Neg
# False-negative summary for the EN + classifier models (`fneg_enplus`).
# Rows are read as 1 = trees, 2 = svm, 3 = svmR, 4 = logs, 5 = rf; each row is
# summarised by mean, median and sd. `## [1]` lines are the rendered output.
# NOTE(review): the means are above 1, so these look like counts per run
# rather than rates - confirm.
fneg_trees <- fneg_enplus[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 3.03
median(fneg_trees)
## [1] 3
sd(fneg_trees)
## [1] 1.403135
fneg_svm <- fneg_enplus[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 2.68
median(fneg_svm)
## [1] 3
sd(fneg_svm)
## [1] 1.462529
fneg_svmR <- fneg_enplus[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 2.38
median(fneg_svmR)
## [1] 2
sd(fneg_svmR)
## [1] 1.369067
fneg_logs <- fneg_enplus[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 2.28
median(fneg_logs)
## [1] 2
sd(fneg_logs)
## [1] 1.303298
fneg_rf <- fneg_enplus[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 3.05
median(fneg_rf)
## [1] 3
sd(fneg_rf)
## [1] 1.351206
3.3.5.3.2 iTwiner + classifiers
# Build the inputs for the iTwiner + classifier runs: an expression table
# restricted to the iTwiner-selected genes (`xdata_iTwiner`, with the class as
# `type`) and a matching label table (`ydata`).
xdata <- xdataT


# Label table: class in the first column (named `clinic1$class` by
# as.data.frame), sample id in `row`, sample ids also used as row names.
ydata.raw <- as.data.frame(clinic1$class)
ydata.raw$row <- DATASET1$ID

#ydata.raw$class <- c(rep(0,28),rep(1,34))

# Recode the two class labels: "P" -> 0, "Pm" -> 1.
# NOTE(review): this assumes the class column is character; on a factor the
# assignment would yield NA and the as.numeric() below would return level
# codes - confirm the type of clinic1$class.
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "P"] <- 0
ydata.raw$`clinic1$class`[ydata.raw$`clinic1$class` == "Pm"] <- 1
rownames(ydata.raw) <- DATASET1$ID

# Keep only the samples present in both tables ...
xdata <- xdata[rownames(xdata) %in%
                         rownames(ydata.raw),]

ydata.raw <- as.data.frame(ydata.raw[rownames(ydata.raw) %in%
                         rownames(xdata),])

# ... and put both in the same (row-name) order.
xdata <- xdata[ order(row.names(xdata)), ]

ydata.raw <- ydata.raw[ order(row.names(ydata.raw)), ]

# Fail early with a clear message if the two tables are not aligned sample by
# sample; every step below pairs them by position.
stopifnot(identical(rownames(xdata), rownames(ydata.raw)))

ydata <- as.data.frame(ydata.raw[,1:2])
colnames(ydata) <- c("class","id")

# `names` shadows base::names(); the name is kept because code outside this
# block may read it.
names <- as.vector(top50_itw)
xdata_iTwiner <- as.data.frame(xdata[,names])

# Remember the gene symbols, then switch to generic Var1..Vark column names.
# The count follows the selected columns instead of a hard-coded 1:50, so a
# gene list of another length no longer breaks the renaming.
nomesgenes <- colnames(xdata_iTwiner)
colnames(xdata_iTwiner) <- paste0("Var", seq_along(nomesgenes))
xdata_iTwiner$type <- as.factor(ydata$class)
ydata$class <- as.numeric(ydata$class)
## [1] 1
## [1] 1011
## [1] 2
## [1] 1022
## [1] 3
## [1] 1033
## [1] 4
## [1] 1044
## [1] 5
## [1] 1055
## [1] 6
## [1] 1066
## [1] 7
## [1] 1077
## [1] 8
## [1] 1088
## [1] 9
## [1] 1099
## [1] 10
## [1] 1110
## [1] 11
## [1] 1121
## [1] 12
## [1] 1132
## [1] 13
## [1] 1143
## [1] 14
## [1] 1154
## [1] 15
## [1] 1165
## [1] 16
## [1] 1176
## [1] 17
## [1] 1187
## [1] 18
## [1] 1198
## [1] 19
## [1] 1209
## [1] 20
## [1] 1220
## [1] 21
## [1] 1231
## [1] 22
## [1] 1242
## [1] 23
## [1] 1253
## [1] 24
## [1] 1264
## [1] 25
## [1] 1275
## [1] 26
## [1] 1286
## [1] 27
## [1] 1297
## [1] 28
## [1] 1308
## [1] 29
## [1] 1319
## [1] 30
## [1] 1330
## [1] 31
## [1] 1341
## [1] 32
## [1] 1352
## [1] 33
## [1] 1363
## [1] 34
## [1] 1374
## [1] 35
## [1] 1385
## [1] 36
## [1] 1396
## [1] 37
## [1] 1407
## [1] 38
## [1] 1418
## [1] 39
## [1] 1429
## [1] 40
## [1] 1440
## [1] 41
## [1] 1451
## [1] 42
## [1] 1462
## [1] 43
## [1] 1473
## [1] 44
## [1] 1484
## [1] 45
## [1] 1495
## [1] 46
## [1] 1506
## [1] 47
## [1] 1517
## [1] 48
## [1] 1528
## [1] 49
## [1] 1539
## [1] 50
## [1] 1550
## [1] 51
## [1] 1561
## [1] 52
## [1] 1572
## [1] 53
## [1] 1583
## [1] 54
## [1] 1594
## [1] 55
## [1] 1605
## [1] 56
## [1] 1616
## [1] 57
## [1] 1627
## [1] 58
## [1] 1638
## [1] 59
## [1] 1649
## [1] 60
## [1] 1660
## [1] 61
## [1] 1671
## [1] 62
## [1] 1682
## [1] 63
## [1] 1693
## [1] 64
## [1] 1704
## [1] 65
## [1] 1715
## [1] 66
## [1] 1726
## [1] 67
## [1] 1737
## [1] 68
## [1] 1748
## [1] 69
## [1] 1759
## [1] 70
## [1] 1770
## [1] 71
## [1] 1781
## [1] 72
## [1] 1792
## [1] 73
## [1] 1803
## [1] 74
## [1] 1814
## [1] 75
## [1] 1825
## [1] 76
## [1] 1836
## [1] 77
## [1] 1847
## [1] 78
## [1] 1858
## [1] 79
## [1] 1869
## [1] 80
## [1] 1880
## [1] 81
## [1] 1891
## [1] 82
## [1] 1902
## [1] 83
## [1] 1913
## [1] 84
## [1] 1924
## [1] 85
## [1] 1935
## [1] 86
## [1] 1946
## [1] 87
## [1] 1957
## [1] 88
## [1] 1968
## [1] 89
## [1] 1979
## [1] 90
## [1] 1990
## [1] 91
## [1] 2001
## [1] 92
## [1] 2012
## [1] 93
## [1] 2023
## [1] 94
## [1] 2034
## [1] 95
## [1] 2045
## [1] 96
## [1] 2056
## [1] 97
## [1] 2067
## [1] 98
## [1] 2078
## [1] 99
## [1] 2089
## [1] 100
## [1] 2100
3.3.5.3.2.1 Measures of model performance train
  • Accuracy
# Training-set accuracy of the iTwiner + classifier models
# (`acc_tcoxplus_train`). Rows are read as 1 = trees, 2 = svm, 3 = svmR,
# 4 = logs, 5 = rf; each row is summarised by mean, median and sd.
# `## [1]` lines are the rendered output of the call directly above them.
acc_trees <- acc_tcoxplus_train[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.9944737
median(acc_trees)
## [1] 1
sd(acc_trees)
## [1] 0.01140351
acc_svm <- acc_tcoxplus_train[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.8486842
median(acc_svm)
## [1] 0.8421053
sd(acc_svm)
## [1] 0.0351624
acc_svmR <- acc_tcoxplus_train[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.7589474
median(acc_svmR)
## [1] 0.8421053
sd(acc_svmR)
## [1] 0.1467214
acc_logs <- acc_tcoxplus_train[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 1
median(acc_logs)
## [1] 1
sd(acc_logs)
## [1] 0
acc_rf <- acc_tcoxplus_train[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.9978947
median(acc_rf)
## [1] 1
sd(acc_rf)
## [1] 0.007175261
  • auc
# Training-set AUC of the iTwiner + classifier models (`auc_tcoxplus_train`).
# Rows are read as 1 = trees, 2 = svm, 3 = svmR, 4 = logs, 5 = rf; each row is
# summarised by mean, median and sd. `## [1]` lines are the rendered output.
auc_trees <- auc_tcoxplus_train[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.9945518
median(auc_trees)
## [1] 1
sd(auc_trees)
## [1] 0.01133615
auc_svm <- auc_tcoxplus_train[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.8316667
median(auc_svm)
## [1] 0.8235294
sd(auc_svm)
## [1] 0.03907989
auc_svmR <- auc_tcoxplus_train[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.7307003
median(auc_svmR)
## [1] 0.8235294
sd(auc_svmR)
## [1] 0.1640102
auc_logs <- auc_tcoxplus_train[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 1
median(auc_logs)
## [1] 1
sd(auc_logs)
## [1] 0
auc_rf <- auc_tcoxplus_train[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.9976471
median(auc_rf)
## [1] 1
sd(auc_rf)
## [1] 0.00801941
  • Misclassification
# Training-set misclassification of the iTwiner + classifier models
# (`miscl_tcoxplus_train`). Rows are read as 1 = trees, 2 = svm, 3 = svmR,
# 4 = logs, 5 = rf; each row is summarised by mean, median and sd.
# NOTE(review): some means are above 1, so these look like counts of
# misclassified samples per run rather than rates - confirm.
miscl_trees <- miscl_tcoxplus_train[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 0.21
median(miscl_trees)
## [1] 0
sd(miscl_trees)
## [1] 0.4333333
miscl_svm <- miscl_tcoxplus_train[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 5.75
median(miscl_svm)
## [1] 6
sd(miscl_svm)
## [1] 1.336171
miscl_svmR <- miscl_tcoxplus_train[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 9.16
median(miscl_svmR)
## [1] 6
sd(miscl_svmR)
## [1] 5.575415
miscl_logs <- miscl_tcoxplus_train[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 0
median(miscl_logs)
## [1] 0
sd(miscl_logs)
## [1] 0
miscl_rf <- miscl_tcoxplus_train[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 0.08
median(miscl_rf)
## [1] 0
sd(miscl_rf)
## [1] 0.2726599
  • Sensitivity
# Training-set sensitivity of the iTwiner + classifier models
# (`sensitivity_tcoxplus_train`). Rows are read as 1 = trees, 2 = svm,
# 3 = svmR, 4 = logs, 5 = rf; each row is summarised by mean, median and sd.
# `## [1]` lines are the rendered output of the call directly above them.
sensitivity_trees <- sensitivity_tcoxplus_train[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.9952941
median(sensitivity_trees)
## [1] 1
sd(sensitivity_trees)
## [1] 0.01603882
sensitivity_svm <- sensitivity_tcoxplus_train[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.67
median(sensitivity_svm)
## [1] 0.6470588
sd(sensitivity_svm)
## [1] 0.0788292
sensitivity_svmR <- sensitivity_tcoxplus_train[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.4623529
median(sensitivity_svmR)
## [1] 0.6470588
sd(sensitivity_svmR)
## [1] 0.3284235
sensitivity_logs <- sensitivity_tcoxplus_train[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 1
median(sensitivity_logs)
## [1] 1
sd(sensitivity_logs)
## [1] 0
sensitivity_rf <- sensitivity_tcoxplus_train[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.9952941
median(sensitivity_rf)
## [1] 1
sd(sensitivity_rf)
## [1] 0.01603882
  • Specificity
# Training-set specificity of the iTwiner + classifier models
# (`specificity_tcoxplus_train`). Rows are read as 1 = trees, 2 = svm,
# 3 = svmR, 4 = logs, 5 = rf; each row is summarised by mean, median and sd.
# `## [1]` lines are the rendered output of the call directly above them.
specificity_trees <- specificity_tcoxplus_train[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.9938095
median(specificity_trees)
## [1] 1
sd(specificity_trees)
## [1] 0.01609513
specificity_svm <- specificity_tcoxplus_train[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.9933333
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.01793274
specificity_svmR <- specificity_tcoxplus_train[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.9990476
median(specificity_svmR)
## [1] 1
sd(specificity_svmR)
## [1] 0.00952381
specificity_logs <- specificity_tcoxplus_train[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 1
median(specificity_logs)
## [1] 1
sd(specificity_logs)
## [1] 0
specificity_rf <- specificity_tcoxplus_train[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 1
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0
  • False Neg
# Training-set false negatives of the iTwiner + classifier models
# (`fneg_tcoxplus_train`). Rows are read as 1 = trees, 2 = svm, 3 = svmR,
# 4 = logs, 5 = rf; each row is summarised by mean, median and sd.
# NOTE(review): some means are above 1, so these look like counts per run
# rather than rates - confirm.
fneg_trees <- fneg_tcoxplus_train[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 0.08
median(fneg_trees)
## [1] 0
sd(fneg_trees)
## [1] 0.2726599
fneg_svm <- fneg_tcoxplus_train[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 5.61
median(fneg_svm)
## [1] 6
sd(fneg_svm)
## [1] 1.340096
fneg_svmR <- fneg_tcoxplus_train[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 9.14
median(fneg_svmR)
## [1] 6
sd(fneg_svmR)
## [1] 5.5832
fneg_logs <- fneg_tcoxplus_train[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 0
median(fneg_logs)
## [1] 0
sd(fneg_logs)
## [1] 0
fneg_rf <- fneg_tcoxplus_train[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 0.08
median(fneg_rf)
## [1] 0
sd(fneg_rf)
## [1] 0.2726599
3.3.5.3.2.2 Measures of model performance test
  • Accuracy
# Test-set accuracy of the iTwiner + classifier models (`acc_tcoxplus`).
# Rows are read as 1 = trees, 2 = svm, 3 = svmR, 4 = logs, 5 = rf; each row is
# summarised by mean, median and sd. `## [1]` lines are the rendered output.
acc_trees <- acc_tcoxplus[1,]
#hist(acc_trees)
mean(acc_trees)
## [1] 0.6747059
median(acc_trees)
## [1] 0.6470588
sd(acc_trees)
## [1] 0.1107468
acc_svm <- acc_tcoxplus[2,]
#hist(acc_svm)
mean(acc_svm)
## [1] 0.7152941
median(acc_svm)
## [1] 0.7058824
sd(acc_svm)
## [1] 0.09224216
acc_svmR <- acc_tcoxplus[3,]
#hist(acc_svmR)
mean(acc_svmR)
## [1] 0.6041176
median(acc_svmR)
## [1] 0.5882353
sd(acc_svmR)
## [1] 0.1086439
acc_logs <- acc_tcoxplus[4,]
#hist(acc_logs)
mean(acc_logs)
## [1] 0.6235294
median(acc_logs)
## [1] 0.6470588
sd(acc_logs)
## [1] 0.09605842
acc_rf <- acc_tcoxplus[5,]
#hist(acc_rf)
mean(acc_rf)
## [1] 0.7376471
median(acc_rf)
## [1] 0.7647059
sd(acc_rf)
## [1] 0.0745098
  • auc
# Test-set AUC of the iTwiner + classifier models (`auc_tcoxplus`).
# Rows are read as 1 = trees, 2 = svm, 3 = svmR, 4 = logs, 5 = rf; each row is
# summarised by mean, median and sd. `## [1]` lines are the rendered output.
auc_trees <- auc_tcoxplus[1,]
#hist(auc_trees)
mean(auc_trees)
## [1] 0.6727083
median(auc_trees)
## [1] 0.6458333
sd(auc_trees)
## [1] 0.1080286
auc_svm <- auc_tcoxplus[2,]
#hist(auc_svm)
mean(auc_svm)
## [1] 0.7053472
median(auc_svm)
## [1] 0.6875
sd(auc_svm)
## [1] 0.08440352
auc_svmR <- auc_tcoxplus[3,]
#hist(auc_svmR)
mean(auc_svmR)
## [1] 0.5975694
median(auc_svmR)
## [1] 0.5833333
sd(auc_svmR)
## [1] 0.1069533
auc_logs <- auc_tcoxplus[4,]
#hist(auc_logs)
mean(auc_logs)
## [1] 0.6215278
median(auc_logs)
## [1] 0.625
sd(auc_logs)
## [1] 0.09167528
auc_rf <- auc_tcoxplus[5,]
#hist(auc_rf)
mean(auc_rf)
## [1] 0.7248611
median(auc_rf)
## [1] 0.75
sd(auc_rf)
## [1] 0.07620552
  • Misclassification
# Test-set misclassification of the iTwiner + classifier models
# (`miscl_tcoxplus`). Rows are read as 1 = trees, 2 = svm, 3 = svmR, 4 = logs,
# 5 = rf; each row is summarised by mean, median and sd.
# NOTE(review): the means are above 1, so these look like counts of
# misclassified samples per run rather than rates - confirm.
miscl_trees <- miscl_tcoxplus[1,]
#hist(miscl_trees)
mean(miscl_trees)
## [1] 5.53
median(miscl_trees)
## [1] 6
sd(miscl_trees)
## [1] 1.882696
miscl_svm <- miscl_tcoxplus[2,]
#hist(miscl_svm)
mean(miscl_svm)
## [1] 4.84
median(miscl_svm)
## [1] 5
sd(miscl_svm)
## [1] 1.568117
miscl_svmR <- miscl_tcoxplus[3,]
#hist(miscl_svmR)
mean(miscl_svmR)
## [1] 6.73
median(miscl_svmR)
## [1] 7
sd(miscl_svmR)
## [1] 1.846947
miscl_logs <- miscl_tcoxplus[4,]
#hist(miscl_logs)
mean(miscl_logs)
## [1] 6.4
median(miscl_logs)
## [1] 6
sd(miscl_logs)
## [1] 1.632993
miscl_rf <- miscl_tcoxplus[5,]
#hist(miscl_rf)
mean(miscl_rf)
## [1] 4.46
median(miscl_rf)
## [1] 4
sd(miscl_rf)
## [1] 1.266667
  • Sensitivity
# Test-set sensitivity of the iTwiner + classifier models
# (`sensitivity_tcoxplus`). Rows are read as 1 = trees, 2 = svm, 3 = svmR,
# 4 = logs, 5 = rf; each row is summarised by mean, median and sd.
# `## [1]` lines are the rendered output of the call directly above them.
sensitivity_trees <- sensitivity_tcoxplus[1,]
#hist(sensitivity_trees)
mean(sensitivity_trees)
## [1] 0.59625
median(sensitivity_trees)
## [1] 0.625
sd(sensitivity_trees)
## [1] 0.1870618
sensitivity_svm <- sensitivity_tcoxplus[2,]
#hist(sensitivity_svm)
mean(sensitivity_svm)
## [1] 0.45875
median(sensitivity_svm)
## [1] 0.5
sd(sensitivity_svm)
## [1] 0.1569668
sensitivity_svmR <- sensitivity_tcoxplus[3,]
#hist(sensitivity_svmR)
mean(sensitivity_svmR)
## [1] 0.37375
median(sensitivity_svmR)
## [1] 0.5
sd(sensitivity_svmR)
## [1] 0.2823149
sensitivity_logs <- sensitivity_tcoxplus[4,]
#hist(sensitivity_logs)
mean(sensitivity_logs)
## [1] 0.525
median(sensitivity_logs)
## [1] 0.5
sd(sensitivity_logs)
## [1] 0.1820548
sensitivity_rf <- sensitivity_tcoxplus[5,]
#hist(sensitivity_rf)
mean(sensitivity_rf)
## [1] 0.5075
median(sensitivity_rf)
## [1] 0.5
sd(sensitivity_rf)
## [1] 0.1419338
  • Specificity
# Test-set specificity of the iTwiner + classifier models
# (`specificity_tcoxplus`). Rows are read as 1 = trees, 2 = svm, 3 = svmR,
# 4 = logs, 5 = rf; each row is summarised by mean, median and sd.
# `## [1]` lines are the rendered output of the call directly above them.
specificity_trees <- specificity_tcoxplus[1,]
#hist(specificity_trees)
mean(specificity_trees)
## [1] 0.7444444
median(specificity_trees)
## [1] 0.7777778
sd(specificity_trees)
## [1] 0.1591065
specificity_svm <- specificity_tcoxplus[2,]
#hist(specificity_svm)
mean(specificity_svm)
## [1] 0.9433333
median(specificity_svm)
## [1] 1
sd(specificity_svm)
## [1] 0.1029494
specificity_svmR <- specificity_tcoxplus[3,]
#hist(specificity_svmR)
mean(specificity_svmR)
## [1] 0.8088889
median(specificity_svmR)
## [1] 0.8888889
sd(specificity_svmR)
## [1] 0.2136856
specificity_logs <- specificity_tcoxplus[4,]
#hist(specificity_logs)
mean(specificity_logs)
## [1] 0.7111111
median(specificity_logs)
## [1] 0.6666667
sd(specificity_logs)
## [1] 0.1729998
specificity_rf <- specificity_tcoxplus[5,]
#hist(specificity_rf)
mean(specificity_rf)
## [1] 0.9422222
median(specificity_rf)
## [1] 1
sd(specificity_rf)
## [1] 0.1017125
  • False Neg
# Test-set false negatives of the iTwiner + classifier models
# (`fneg_tcoxplus`). Rows are read as 1 = trees, 2 = svm, 3 = svmR, 4 = logs,
# 5 = rf; each row is summarised by mean, median and sd.
# NOTE(review): the means are above 1, so these look like counts per run
# rather than rates - confirm.
fneg_trees <- fneg_tcoxplus[1,]
#hist(fneg_trees)
mean(fneg_trees)
## [1] 3.23
median(fneg_trees)
## [1] 3
sd(fneg_trees)
## [1] 1.496494
fneg_svm <- fneg_tcoxplus[2,]
#hist(fneg_svm)
mean(fneg_svm)
## [1] 4.33
median(fneg_svm)
## [1] 4
sd(fneg_svm)
## [1] 1.255734
fneg_svmR <- fneg_tcoxplus[3,]
#hist(fneg_svmR)
mean(fneg_svmR)
## [1] 5.01
median(fneg_svmR)
## [1] 4
sd(fneg_svmR)
## [1] 2.258519
fneg_logs <- fneg_tcoxplus[4,]
#hist(fneg_logs)
mean(fneg_logs)
## [1] 3.8
median(fneg_logs)
## [1] 4
sd(fneg_logs)
## [1] 1.456438
fneg_rf <- fneg_tcoxplus[5,]
#hist(fneg_rf)
mean(fneg_rf)
## [1] 3.94
median(fneg_rf)
## [1] 4
sd(fneg_rf)
## [1] 1.13547
# save results

#save.image("~/results3_2010_final.RData")

3.3.6 Data final

# Decision-tree accuracies (row 1) for DATASET3: classifier alone, after EN
# selection and after iTwiner selection.
dt3 <- acc[1, ]
dt_en3 <- acc_enplus[1, ]
dt_iTwiner3 <- acc_tcoxplus[1, ]
#dt_hub <- acc_hubplus[1,]

# Long format: one accuracy per row, stacked in the order
# plain -> EN -> iTwiner.
acc_dt3 <- as.data.frame(c(dt3, dt_en3, dt_iTwiner3
                           #,dt_hub
))
colnames(acc_dt3) <- "acc"

# Label each stretch from the length of its source vector rather than the
# fixed 1:100 / 101:200 / 201:300 ranges, so the labels stay correct when the
# number of runs is not 100. Building the ordered factor directly also
# replaces the superseded mutate_if() step, whose result was overwritten.
# acc_dt$group <- ordered(acc_dt$group, levels = c("DT", "EN + DT","iTwiner + DT", "HUB + DT"))
dt_groups3 <- c("DT", "EN + DT", "iTwiner + DT")
acc_dt3$group <- ordered(
  rep(dt_groups3,
      times = c(length(dt3), length(dt_en3), length(dt_iTwiner3))),
  levels = dt_groups3
)

acc_dt3$dataset <- "DATASET3"



# Linear-SVM accuracies (row 2) for DATASET3: classifier alone, after EN
# selection and after iTwiner selection.
svmL3 <- acc[2, ]
svmL_en3 <- acc_enplus[2, ]
svmL_iTwiner3 <- acc_tcoxplus[2, ]

# Long format: one accuracy per row, stacked in the order
# plain -> EN -> iTwiner.
acc_svmL3 <- as.data.frame(c(svmL3, svmL_en3, svmL_iTwiner3
                             #,svmL_hub
))
colnames(acc_svmL3) <- "acc"

# Labels follow the length of each source vector instead of the fixed
# 1:100 / 101:200 / 201:300 ranges; the ordered factor is built directly,
# which makes the superseded mutate_if() step unnecessary.
svmL_groups3 <- c("svmL", "EN + svmL", "iTwiner + svmL")
acc_svmL3$group <- ordered(
  rep(svmL_groups3,
      times = c(length(svmL3), length(svmL_en3), length(svmL_iTwiner3))),
  levels = svmL_groups3
)

acc_svmL3$dataset <- "DATASET3"




# Radial-SVM accuracies (row 3) for DATASET3: classifier alone, after EN
# selection and after iTwiner selection.
svmR3 <- acc[3, ]
svmR_en3 <- acc_enplus[3, ]
svmR_iTwiner3 <- acc_tcoxplus[3, ]

# Long format: one accuracy per row, stacked in the order
# plain -> EN -> iTwiner.
acc_svmR3 <- as.data.frame(c(svmR3, svmR_en3, svmR_iTwiner3
                             #,svmR_hub
))
colnames(acc_svmR3) <- "acc"

# Labels follow the length of each source vector instead of the fixed
# 1:100 / 101:200 / 201:300 ranges; the ordered factor is built directly,
# which makes the superseded mutate_if() step unnecessary.
svmR_groups3 <- c("svmR", "EN + svmR", "iTwiner + svmR")
acc_svmR3$group <- ordered(
  rep(svmR_groups3,
      times = c(length(svmR3), length(svmR_en3), length(svmR_iTwiner3))),
  levels = svmR_groups3
)

acc_svmR3$dataset <- "DATASET3"






# Logistic-regression accuracies (row 4) for DATASET3: classifier alone, after
# EN selection and after iTwiner selection.
logist3 <- acc[4, ]
logist_en3 <- acc_enplus[4, ]
logist_iTwiner3 <- acc_tcoxplus[4, ]

# Long format: one accuracy per row, stacked in the order
# plain -> EN -> iTwiner.
acc_logist3 <- as.data.frame(c(logist3, logist_en3, logist_iTwiner3
                               #,logist_hub
))
colnames(acc_logist3) <- "acc"

# Labels follow the length of each source vector instead of the fixed
# 1:100 / 101:200 / 201:300 ranges; the ordered factor is built directly,
# which makes the superseded mutate_if() step unnecessary.
logist_groups3 <- c("logist", "EN + logist", "iTwiner + logist")
acc_logist3$group <- ordered(
  rep(logist_groups3,
      times = c(length(logist3), length(logist_en3), length(logist_iTwiner3))),
  levels = logist_groups3
)

acc_logist3$dataset <- "DATASET3"






# Random-forest accuracies (row 5) for DATASET3: classifier alone, after EN
# selection and after iTwiner selection.
rf3 <- acc[5, ]
rf_en3 <- acc_enplus[5, ]
rf_iTwiner3 <- acc_tcoxplus[5, ]

# Long format: one accuracy per row, stacked in the order
# plain -> EN -> iTwiner.
acc_rf3 <- as.data.frame(c(rf3, rf_en3, rf_iTwiner3
                           #,rf_hub
))
colnames(acc_rf3) <- "acc"

# Labels follow the length of each source vector instead of the fixed
# 1:100 / 101:200 / 201:300 ranges; the ordered factor is built directly,
# which makes the superseded mutate_if() step unnecessary.
rf_groups3 <- c("rf", "EN + rf", "iTwiner + rf")
acc_rf3$group <- ordered(
  rep(rf_groups3,
      times = c(length(rf3), length(rf_en3), length(rf_iTwiner3))),
  levels = rf_groups3
)

acc_rf3$dataset <- "DATASET3"

4 final results

# Libraries
library(ggplot2)
library(dplyr)
library(forcats)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(viridis)
## Loading required package: viridisLite
# For each classifier, stack the per-dataset accuracy tables built earlier in
# the session and draw accuracy by method, one box per dataset.

# Decision trees
data_dt <- rbind(acc_dt1, acc_dt2, acc_dt3)

p <- ggplot(data_dt, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "DT", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Linear SVM
data_svmL <- rbind(acc_svmL1, acc_svmL2, acc_svmL3)

p <- ggplot(data_svmL, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "svmL", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Radial SVM
data_svmR <- rbind(acc_svmR1, acc_svmR2, acc_svmR3)

p <- ggplot(data_svmR, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "svmR", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Logistic regression
data_logis <- rbind(acc_logist1, acc_logist2, acc_logist3)

p <- ggplot(data_logis, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "LR", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Random forest
data_rf <- rbind(acc_rf1, acc_rf2, acc_rf3)

p <- ggplot(data_rf, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "RF", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

5 final plots comparing data partitions

# Libraries
library(ggplot2)
library(dplyr)
library(forcats)
library(hrbrthemes)
library(viridis)

# Same comparison with full classifier names in the titles: stack the
# per-dataset accuracy tables built earlier in the session and draw accuracy
# by method, one box per dataset.

# Decision trees
data_dt <- rbind(acc_dt1, acc_dt2, acc_dt3)

p <- ggplot(data_dt, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Decision Trees", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Linear SVM
data_svmL <- rbind(acc_svmL1, acc_svmL2, acc_svmL3)

p <- ggplot(data_svmL, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "svmL", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Radial SVM
data_svmR <- rbind(acc_svmR1, acc_svmR2, acc_svmR3)

p <- ggplot(data_svmR, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "svmR", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Logistic regression
data_logis <- rbind(acc_logist1, acc_logist2, acc_logist3)

p <- ggplot(data_logis, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Logistic", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

# Random forest
data_rf <- rbind(acc_rf1, acc_rf2, acc_rf3)

p <- ggplot(data_rf, aes(x = group, y = acc, fill = dataset)) +
  geom_boxplot() +
  theme_bw()
p +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Random Forest", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold")
  )

6 Final Stats

6.1 DT

all decision trees

# Unpaired pairwise comparison of accuracy between datasets, run separately
# within each method group, with Benjamini-Hochberg adjusted p-values.
# `paired` is spelled FALSE because `F` is an ordinary, reassignable variable.
stat.test <- compare_means(
  acc ~ dataset,
  data = data_dt,
  group.by = "group",
  paired = FALSE,
  p.adjust.method = "BH"
)
stat.test
## # A tibble: 9 × 9
##   group        .y.   group1   group2          p   p.adj p.format p.signif method
##   <ord>        <chr> <chr>    <chr>       <dbl>   <dbl> <chr>    <chr>    <chr> 
## 1 DT           acc   DATASET1 DATASET2 1.91e- 1 2.5e- 1 0.1907   ns       Wilco…
## 2 DT           acc   DATASET1 DATASET3 4.41e- 3 6.6e- 3 0.0044   **       Wilco…
## 3 DT           acc   DATASET2 DATASET3 2.02e- 5 3.6e- 5 2.0e-05  ****     Wilco…
## 4 EN + DT      acc   DATASET1 DATASET2 6.10e- 6 1.4e- 5 6.1e-06  ****     Wilco…
## 5 EN + DT      acc   DATASET1 DATASET3 4.94e- 6 1.4e- 5 4.9e-06  ****     Wilco…
## 6 EN + DT      acc   DATASET2 DATASET3 7.50e- 1 7.5e- 1 0.7499   ns       Wilco…
## 7 iTwiner + DT acc   DATASET1 DATASET2 2.72e-13 2.4e-12 2.7e-13  ****     Wilco…
## 8 iTwiner + DT acc   DATASET1 DATASET3 1.31e-10 5.9e-10 1.3e-10  ****     Wilco…
## 9 iTwiner + DT acc   DATASET2 DATASET3 6.89e- 1 7.5e- 1 0.6890   ns       Wilco…
# Accuracy by method, one box per dataset; the legend title is suppressed.
ggboxplot(data_dt, x = "group", y = "acc", fill = "dataset") +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Decision Trees", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    legend.title = element_blank()
  )

#p + stat_compare_means(aes(group = dataset))

#stat_compare_means(label.y = 50)     

#aes(group = dataset),comparisons = my_comparisons

DT

# Plain decision-tree runs from every dataset. Rows are selected by their
# group label instead of the fixed ranges 1:100, 301:400, 601:700, which only
# hold when every dataset contributes exactly 100 runs per method.
dt_stats <- as.data.frame(data_dt[data_dt$group == "DT", ])

# Statistical test
# Unpaired pairwise comparison between datasets, BH-adjusted; `paired` is
# spelled FALSE because `F` is a reassignable variable.
stat.test <- compare_means(
  acc ~ dataset,
  data = dt_stats,
  group.by = "group",
  paired = FALSE,
  p.adjust.method = "BH"
)
stat.test
## # A tibble: 3 × 9
##   group .y.   group1   group2           p    p.adj p.format p.signif method  
##   <ord> <chr> <chr>    <chr>        <dbl>    <dbl> <chr>    <chr>    <chr>   
## 1 DT    acc   DATASET1 DATASET2 0.191     0.19     0.1907   ns       Wilcoxon
## 2 DT    acc   DATASET1 DATASET3 0.00441   0.0066   0.0044   **       Wilcoxon
## 3 DT    acc   DATASET2 DATASET3 0.0000202 0.000061 2e-05    ****     Wilcoxon
# Box plot
# One box per dataset, annotated with the significance of each pairwise test.
my_comparisons <- list(
  c("DATASET1", "DATASET2"),
  c("DATASET1", "DATASET3"),
  c("DATASET2", "DATASET3")
)

ggboxplot(dt_stats, x = "dataset", y = "acc", fill = "dataset") +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Decision Trees", x = "DT", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  ) +
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")

DT + EN

# EN + decision-tree runs from every dataset. Rows are selected by their
# group label instead of the fixed ranges 101:200, 401:500, 701:800, which
# only hold when every dataset contributes exactly 100 runs per method.
dt_en_stats <- as.data.frame(data_dt[data_dt$group == "EN + DT", ])

# Statistical test
# Unpaired pairwise comparison between datasets, BH-adjusted; `paired` is
# spelled FALSE because `F` is a reassignable variable.
stat.test <- compare_means(
  acc ~ dataset,
  data = dt_en_stats,
  group.by = "group",
  paired = FALSE,
  p.adjust.method = "BH"
)
stat.test
## # A tibble: 3 × 9
##   group   .y.   group1   group2            p     p.adj p.format p.signif method 
##   <ord>   <chr> <chr>    <chr>         <dbl>     <dbl> <chr>    <chr>    <chr>  
## 1 EN + DT acc   DATASET1 DATASET2 0.00000610 0.0000091 6.1e-06  ****     Wilcox…
## 2 EN + DT acc   DATASET1 DATASET3 0.00000494 0.0000091 4.9e-06  ****     Wilcox…
## 3 EN + DT acc   DATASET2 DATASET3 0.750      0.75      0.75     ns       Wilcox…
# Box plot
# One box per dataset, annotated with the significance of each pairwise test.
my_comparisons <- list(
  c("DATASET1", "DATASET2"),
  c("DATASET1", "DATASET3"),
  c("DATASET2", "DATASET3")
)

ggboxplot(dt_en_stats, x = "dataset", y = "acc", fill = "dataset") +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Decision Trees", x = "DT + EN", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  ) +
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")

DT + iTW

# iTwiner + decision-tree runs from every dataset. Rows are selected by their
# group label instead of the fixed ranges 201:300, 501:600, 801:900, which
# only hold when every dataset contributes exactly 100 runs per method.
dt_itw_stats <- as.data.frame(data_dt[data_dt$group == "iTwiner + DT", ])

# Statistical test
# Unpaired pairwise comparison between datasets, BH-adjusted; `paired` is
# spelled FALSE because `F` is a reassignable variable.
stat.test <- compare_means(
  acc ~ dataset,
  data = dt_itw_stats,
  group.by = "group",
  paired = FALSE,
  p.adjust.method = "BH"
)
stat.test
## # A tibble: 3 × 9
##   group        .y.   group1   group2          p   p.adj p.format p.signif method
##   <ord>        <chr> <chr>    <chr>       <dbl>   <dbl> <chr>    <chr>    <chr> 
## 1 iTwiner + DT acc   DATASET1 DATASET2 2.72e-13 8.1e-13 2.7e-13  ****     Wilco…
## 2 iTwiner + DT acc   DATASET1 DATASET3 1.31e-10 2  e-10 1.3e-10  ****     Wilco…
## 3 iTwiner + DT acc   DATASET2 DATASET3 6.89e- 1 6.9e- 1 0.69     ns       Wilco…
# Box plot
# One box per dataset, annotated with the significance of each pairwise test.
my_comparisons <- list(
  c("DATASET1", "DATASET2"),
  c("DATASET1", "DATASET3"),
  c("DATASET2", "DATASET3")
)

ggboxplot(dt_itw_stats, x = "dataset", y = "acc", fill = "dataset") +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Decision Trees", x = "DT + iTW", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    legend.position = "none"
  ) +
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")

6.2 svmL

all svmL

# Unpaired pairwise comparison of linear-SVM accuracy between datasets, run
# separately within each method group, with BH-adjusted p-values.
# `paired` is spelled FALSE because `F` is an ordinary, reassignable variable.
stat.test <- compare_means(
  acc ~ dataset,
  data = data_svmL,
  group.by = "group",
  paired = FALSE,
  p.adjust.method = "BH"
)
stat.test
## # A tibble: 9 × 9
##   group          .y.   group1   group2        p   p.adj p.format p.signif method
##   <ord>          <chr> <chr>    <chr>     <dbl>   <dbl> <chr>    <chr>    <chr> 
## 1 svmL           acc   DATASET1 DATAS… 5.09e- 2 9.2e- 2 0.051    ns       Wilco…
## 2 svmL           acc   DATASET1 DATAS… 3.15e- 1 4.7e- 1 0.315    ns       Wilco…
## 3 svmL           acc   DATASET2 DATAS… 1.05e- 3 2.4e- 3 0.001    **       Wilco…
## 4 EN + svmL      acc   DATASET1 DATAS… 6.46e- 1 7.5e- 1 0.646    ns       Wilco…
## 5 EN + svmL      acc   DATASET1 DATAS… 9.30e- 1 9.3e- 1 0.930    ns       Wilco…
## 6 EN + svmL      acc   DATASET2 DATAS… 6.66e- 1 7.5e- 1 0.666    ns       Wilco…
## 7 iTwiner + svmL acc   DATASET1 DATAS… 1.27e- 9 5.7e- 9 1.3e-09  ****     Wilco…
## 8 iTwiner + svmL acc   DATASET1 DATAS… 4.70e-20 4.2e-19 < 2e-16  ****     Wilco…
## 9 iTwiner + svmL acc   DATASET2 DATAS… 2.82e- 6 8.5e- 6 2.8e-06  ****     Wilco…
# Accuracy by method, one box per dataset; the legend title is suppressed.
ggboxplot(data_svmL, x = "group", y = "acc", fill = "dataset") +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "svmL", x = "Method", y = "Acc") +
  theme(
    plot.title = element_text(size = 16, face = "bold.italic", hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    legend.title = element_blank()
  )

6.3 svmR

svmR

# Plain radial-SVM accuracies from every dataset. Rows are selected by group
# label and columns by name, replacing the fixed row ranges
# (1:100, 301:400, 601:700) and the positional columns c(1, 3).
# NOTE(review): assumes all three datasets label this group "svmR", as
# DATASET3 does - confirm for DATASET1 and DATASET2.
svmR_stats <- as.data.frame(
  data_svmR[data_svmR$group == "svmR", c("acc", "dataset")]
)

# The dataset column becomes the grouping variable of the tests below.
colnames(svmR_stats) <- c("acc", "group")

#Kruskal-Wallis test

kruskal.test(acc ~ group, data = svmR_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 55.892, df = 2, p-value = 7.297e-13
#pairwise.wilcox.test() to calculate pairwise comparisons between group levels with corrections for multiple testing

res <- pairwise.wilcox.test(svmR_stats$acc, svmR_stats$group,
                            p.adjust.method = "BH")

res$p.value
##              DATASET1     DATASET2
## DATASET2 2.075462e-02           NA
## DATASET3 6.533107e-13 6.701501e-07

svmR + EN

# EN + radial-SVM accuracies from every dataset. Rows are selected by group
# label and columns by name, replacing the fixed row ranges
# (101:200, 401:500, 701:800) and the positional columns c(1, 3).
# NOTE(review): assumes all three datasets label this group "EN + svmR", as
# DATASET3 does - confirm for DATASET1 and DATASET2.
svmR_en_stats <- as.data.frame(
  data_svmR[data_svmR$group == "EN + svmR", c("acc", "dataset")]
)

# The dataset column becomes the grouping variable of the tests below.
colnames(svmR_en_stats) <- c("acc", "group")

#Kruskal-Wallis test

kruskal.test(acc ~ group, data = svmR_en_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 6.1573, df = 2, p-value = 0.04602
#pairwise.wilcox.test() to calculate pairwise comparisons between group levels with corrections for multiple testing

res <- pairwise.wilcox.test(svmR_en_stats$acc, svmR_en_stats$group,
                            p.adjust.method = "BH")

res$p.value
##           DATASET1  DATASET2
## DATASET2 0.7995923        NA
## DATASET3 0.0731924 0.0605481

svmR + iTW

# iTwiner + radial-SVM accuracies from every dataset. Rows are selected by
# group label and columns by name, replacing the fixed row ranges
# (201:300, 501:600, 801:900) and the positional columns c(1, 3).
# NOTE(review): assumes all three datasets label this group "iTwiner + svmR",
# as DATASET3 does - confirm for DATASET1 and DATASET2.
svmR_itw_stats <- as.data.frame(
  data_svmR[data_svmR$group == "iTwiner + svmR", c("acc", "dataset")]
)

# The dataset column becomes the grouping variable of the tests below.
colnames(svmR_itw_stats) <- c("acc", "group")

#Kruskal-Wallis test

kruskal.test(acc ~ group, data = svmR_itw_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 124.65, df = 2, p-value < 2.2e-16
#pairwise.wilcox.test() to calculate pairwise comparisons between group levels with corrections for multiple testing

res <- pairwise.wilcox.test(svmR_itw_stats$acc, svmR_itw_stats$group,
                            p.adjust.method = "BH")

res$p.value
##              DATASET1     DATASET2
## DATASET2 4.036850e-09           NA
## DATASET3 3.273268e-24 2.028999e-12

6.4 Logis

logis

# Plain logistic-regression accuracies from every dataset. Rows are selected
# by group label and columns by name, replacing the fixed row ranges
# (1:100, 301:400, 601:700) and the positional columns c(1, 3).
# NOTE(review): assumes all three datasets label this group "logist", as
# DATASET3 does - confirm for DATASET1 and DATASET2.
logis_stats <- as.data.frame(
  data_logis[data_logis$group == "logist", c("acc", "dataset")]
)

# The dataset column becomes the grouping variable of the tests below.
colnames(logis_stats) <- c("acc", "group")

#Kruskal-Wallis test

kruskal.test(acc ~ group, data = logis_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 9.591, df = 2, p-value = 0.008267
#pairwise.wilcox.test() to calculate pairwise comparisons between group levels with corrections for multiple testing

res <- pairwise.wilcox.test(logis_stats$acc, logis_stats$group,
                            p.adjust.method = "BH")

res$p.value
##            DATASET1  DATASET2
## DATASET2 0.00696590        NA
## DATASET3 0.05380181 0.4193445

logis + EN

# EN + logistic-regression accuracies from every dataset. Rows are selected
# by group label and columns by name, replacing the fixed row ranges
# (101:200, 401:500, 701:800) and the positional columns c(1, 3).
# NOTE(review): assumes all three datasets label this group "EN + logist", as
# DATASET3 does - confirm for DATASET1 and DATASET2.
logis_en_stats <- as.data.frame(
  data_logis[data_logis$group == "EN + logist", c("acc", "dataset")]
)

# The dataset column becomes the grouping variable of the tests below.
colnames(logis_en_stats) <- c("acc", "group")

#Kruskal-Wallis test

kruskal.test(acc ~ group, data = logis_en_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 12.276, df = 2, p-value = 0.002159
#pairwise.wilcox.test() to calculate pairwise comparisons between group levels with corrections for multiple testing

res <- pairwise.wilcox.test(logis_en_stats$acc, logis_en_stats$group,
                            p.adjust.method = "BH")

res$p.value
##            DATASET1    DATASET2
## DATASET2 0.36443843          NA
## DATASET3 0.02405311 0.001572091

logis + iTW

# Take three 100-row slices of `data_logis` (rows 201-300, 501-600, 801-900),
# keeping column 1 as the accuracy and column 3 as the grouping label.
# NOTE(review): presumably one slice per dataset for this model variant --
# confirm against where `data_logis` is assembled.
logis_itw_stats <- setNames(
  as.data.frame(data_logis[c(201:300, 501:600, 801:900), c(1, 3)]),
  c("acc", "group")
)

# Kruskal-Wallis rank sum test: does accuracy differ across the groups?
kruskal.test(acc ~ group, data = logis_itw_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 61.238, df = 2, p-value = 5.039e-14
# Pairwise Wilcoxon rank sum tests between every pair of group levels;
# p-values are corrected for multiple testing with the "BH" method.
res <- pairwise.wilcox.test(
  x = logis_itw_stats$acc,
  g = logis_itw_stats$group,
  p.adjust.method = "BH"
)

# Show the matrix of adjusted pairwise p-values.
res[["p.value"]]
##              DATASET1    DATASET2
## DATASET2 1.921927e-07          NA
## DATASET3 1.658938e-13 0.007099146

6.5 RF

rf

# Take three 100-row slices of `data_rf` (rows 1-100, 301-400, 601-700),
# keeping column 1 as the accuracy and column 3 as the grouping label.
# NOTE(review): presumably one slice per dataset for this model variant --
# confirm against where `data_rf` is assembled.
rf_stats <- setNames(
  as.data.frame(data_rf[c(1:100, 301:400, 601:700), c(1, 3)]),
  c("acc", "group")
)

# Kruskal-Wallis rank sum test: does accuracy differ across the groups?
kruskal.test(acc ~ group, data = rf_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 30.39, df = 2, p-value = 2.517e-07
# Pairwise Wilcoxon rank sum tests between every pair of group levels;
# p-values are corrected for multiple testing with the "BH" method.
res <- pairwise.wilcox.test(
  x = rf_stats$acc,
  g = rf_stats$group,
  p.adjust.method = "BH"
)

# Show the matrix of adjusted pairwise p-values.
res[["p.value"]]
##              DATASET1    DATASET2
## DATASET2 1.943128e-02          NA
## DATASET3 1.860940e-07 0.001425355

rf + EN

# Take three 100-row slices of `data_rf` (rows 101-200, 401-500, 701-800),
# keeping column 1 as the accuracy and column 3 as the grouping label.
# NOTE(review): presumably one slice per dataset for this model variant --
# confirm against where `data_rf` is assembled.
rf_en_stats <- setNames(
  as.data.frame(data_rf[c(101:200, 401:500, 701:800), c(1, 3)]),
  c("acc", "group")
)

# Kruskal-Wallis rank sum test: does accuracy differ across the groups?
kruskal.test(acc ~ group, data = rf_en_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 10.373, df = 2, p-value = 0.005591
# Pairwise Wilcoxon rank sum tests between every pair of group levels;
# p-values are corrected for multiple testing with the "BH" method.
res <- pairwise.wilcox.test(
  x = rf_en_stats$acc,
  g = rf_en_stats$group,
  p.adjust.method = "BH"
)

# Show the matrix of adjusted pairwise p-values.
res[["p.value"]]
##            DATASET1  DATASET2
## DATASET2 0.01629207        NA
## DATASET3 0.01073887 0.4640821

rf + iTW

# Take three 100-row slices of `data_rf` (rows 201-300, 501-600, 801-900),
# keeping column 1 as the accuracy and column 3 as the grouping label.
# NOTE(review): presumably one slice per dataset for this model variant --
# confirm against where `data_rf` is assembled.
rf_itw_stats <- setNames(
  as.data.frame(data_rf[c(201:300, 501:600, 801:900), c(1, 3)]),
  c("acc", "group")
)

# Kruskal-Wallis rank sum test: does accuracy differ across the groups?
kruskal.test(acc ~ group, data = rf_itw_stats)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  acc by group
## Kruskal-Wallis chi-squared = 146.65, df = 2, p-value < 2.2e-16
# Pairwise Wilcoxon rank sum tests between every pair of group levels;
# p-values are corrected for multiple testing with the "BH" method.
res <- pairwise.wilcox.test(
  x = rf_itw_stats$acc,
  g = rf_itw_stats$group,
  p.adjust.method = "BH"
)

# Show the matrix of adjusted pairwise p-values.
res[["p.value"]]
##              DATASET1     DATASET2
## DATASET2 1.010677e-06           NA
## DATASET3 2.376360e-27 2.485967e-19
# Write the current workspace to an .RData file in the home directory.
save.image(file = "~/paper_final.RData")